Index: cddl/contrib/opensolaris/cmd/zdb/zdb.c =================================================================== --- cddl/contrib/opensolaris/cmd/zdb/zdb.c +++ cddl/contrib/opensolaris/cmd/zdb/zdb.c @@ -2281,14 +2281,14 @@ object_count++; } - ASSERT3U(object_count, ==, usedobjs); - (void) printf("\n"); if (error != ESRCH) { (void) fprintf(stderr, "dmu_object_next() = %d\n", error); abort(); } + + ASSERT3U(object_count, ==, usedobjs); } static void @@ -2788,6 +2788,7 @@ mutex_enter(&spa->spa_scrub_lock); spa->spa_scrub_inflight--; + spa->spa_load_verify_ios--; cv_broadcast(&spa->spa_scrub_io_cv); if (ioerr && !(zio->io_flags & ZIO_FLAG_SPECULATIVE)) { @@ -2859,9 +2860,10 @@ flags |= ZIO_FLAG_SPECULATIVE; mutex_enter(&spa->spa_scrub_lock); - while (spa->spa_scrub_inflight > max_inflight) + while (spa->spa_load_verify_ios > max_inflight) cv_wait(&spa->spa_scrub_io_cv, &spa->spa_scrub_lock); spa->spa_scrub_inflight++; + spa->spa_load_verify_ios++; mutex_exit(&spa->spa_scrub_lock); zio_nowait(zio_read(NULL, spa, bp, abd, size, Index: cddl/contrib/opensolaris/cmd/zpool/zpool_main.c =================================================================== --- cddl/contrib/opensolaris/cmd/zpool/zpool_main.c +++ cddl/contrib/opensolaris/cmd/zpool/zpool_main.c @@ -1643,7 +1643,7 @@ (void) nvlist_lookup_uint64_array(nv, ZPOOL_CONFIG_SCAN_STATS, (uint64_t **)&ps, &c); - if (ps && ps->pss_state == DSS_SCANNING && + if (ps != NULL && ps->pss_state == DSS_SCANNING && vs->vs_scan_processed != 0 && children == 0) { (void) printf(gettext(" (%s)"), (ps->pss_func == POOL_SCAN_RESILVER) ? 
@@ -4254,11 +4254,13 @@ print_scan_status(pool_scan_stat_t *ps) { time_t start, end, pause; - uint64_t elapsed, mins_left, hours_left; - uint64_t pass_exam, examined, total; - uint_t rate; + uint64_t total_secs_left; + uint64_t elapsed, secs_left, mins_left, hours_left, days_left; + uint64_t pass_scanned, scanned, pass_issued, issued, total; + uint_t scan_rate, issue_rate; double fraction_done; - char processed_buf[7], examined_buf[7], total_buf[7], rate_buf[7]; + char processed_buf[7], scanned_buf[7], issued_buf[7], total_buf[7]; + char srate_buf[7], irate_buf[7]; (void) printf(gettext(" scan: ")); @@ -4272,30 +4274,37 @@ start = ps->pss_start_time; end = ps->pss_end_time; pause = ps->pss_pass_scrub_pause; + zfs_nicenum(ps->pss_processed, processed_buf, sizeof (processed_buf)); assert(ps->pss_func == POOL_SCAN_SCRUB || ps->pss_func == POOL_SCAN_RESILVER); - /* - * Scan is finished or canceled. - */ - if (ps->pss_state == DSS_FINISHED) { - uint64_t minutes_taken = (end - start) / 60; - char *fmt = NULL; + /* Scan is finished or canceled. 
*/ + if (ps->pss_state == DSS_FINISHED) { + total_secs_left = end - start; + days_left = total_secs_left / 60 / 60 / 24; + hours_left = (total_secs_left / 60 / 60) % 24; + mins_left = (total_secs_left / 60) % 60; + secs_left = (total_secs_left % 60); + if (ps->pss_func == POOL_SCAN_SCRUB) { - fmt = gettext("scrub repaired %s in %lluh%um with " - "%llu errors on %s"); + (void) printf(gettext("scrub repaired %s " + "in %llu days %02llu:%02llu:%02llu " + "with %llu errors on %s"), processed_buf, + (u_longlong_t)days_left, (u_longlong_t)hours_left, + (u_longlong_t)mins_left, (u_longlong_t)secs_left, + (u_longlong_t)ps->pss_errors, ctime(&end)); } else if (ps->pss_func == POOL_SCAN_RESILVER) { - fmt = gettext("resilvered %s in %lluh%um with " - "%llu errors on %s"); + (void) printf(gettext("resilvered %s " + "in %llu days %02llu:%02llu:%02llu " + "with %llu errors on %s"), processed_buf, + (u_longlong_t)days_left, (u_longlong_t)hours_left, + (u_longlong_t)mins_left, (u_longlong_t)secs_left, + (u_longlong_t)ps->pss_errors, ctime(&end)); + } - /* LINTED */ - (void) printf(fmt, processed_buf, - (u_longlong_t)(minutes_taken / 60), - (uint_t)(minutes_taken % 60), - (u_longlong_t)ps->pss_errors, - ctime((time_t *)&end)); + return; } else if (ps->pss_state == DSS_CANCELED) { if (ps->pss_func == POOL_SCAN_SCRUB) { @@ -4310,19 +4319,15 @@ assert(ps->pss_state == DSS_SCANNING); - /* - * Scan is in progress. - */ + /* Scan is in progress. Resilvers can't be paused. 
*/ if (ps->pss_func == POOL_SCAN_SCRUB) { if (pause == 0) { (void) printf(gettext("scrub in progress since %s"), ctime(&start)); } else { - char buf[32]; - struct tm *p = localtime(&pause); - (void) strftime(buf, sizeof (buf), "%a %b %e %T %Y", p); - (void) printf(gettext("scrub paused since %s\n"), buf); - (void) printf(gettext("\tscrub started on %s"), + (void) printf(gettext("scrub paused since %s"), + ctime(&pause)); + (void) printf(gettext("\tscrub started on %s"), ctime(&start)); } } else if (ps->pss_func == POOL_SCAN_RESILVER) { @@ -4330,49 +4335,67 @@ ctime(&start)); } - examined = ps->pss_examined ? ps->pss_examined : 1; + scanned = ps->pss_examined; + pass_scanned = ps->pss_pass_exam; + issued = ps->pss_issued; + pass_issued = ps->pss_pass_issued; total = ps->pss_to_examine; - fraction_done = (double)examined / total; - /* elapsed time for this pass */ + /* we are only done with a block once we have issued the IO for it */ + fraction_done = (double)issued / total; + + /* elapsed time for this pass, rounding up to 1 if it's 0 */ elapsed = time(NULL) - ps->pss_pass_start; elapsed -= ps->pss_pass_scrub_spent_paused; - elapsed = elapsed ? elapsed : 1; - pass_exam = ps->pss_pass_exam ? ps->pss_pass_exam : 1; - rate = pass_exam / elapsed; - rate = rate ? rate : 1; - mins_left = ((total - examined) / rate) / 60; - hours_left = mins_left / 60; + elapsed = (elapsed != 0) ? elapsed : 1; - zfs_nicenum(examined, examined_buf, sizeof (examined_buf)); + scan_rate = pass_scanned / elapsed; + issue_rate = pass_issued / elapsed; + total_secs_left = (issue_rate != 0) ? 
+ ((total - issued) / issue_rate) : UINT64_MAX; + + days_left = total_secs_left / 60 / 60 / 24; + hours_left = (total_secs_left / 60 / 60) % 24; + mins_left = (total_secs_left / 60) % 60; + secs_left = (total_secs_left % 60); + + /* format all of the numbers we will be reporting */ + zfs_nicenum(scanned, scanned_buf, sizeof (scanned_buf)); + zfs_nicenum(issued, issued_buf, sizeof (issued_buf)); zfs_nicenum(total, total_buf, sizeof (total_buf)); + zfs_nicenum(scan_rate, srate_buf, sizeof (srate_buf)); + zfs_nicenum(issue_rate, irate_buf, sizeof (irate_buf)); - /* - * do not print estimated time if hours_left is more than 30 days - * or we have a paused scrub - */ + /* do not print estimated time if we have a paused scrub */ if (pause == 0) { - zfs_nicenum(rate, rate_buf, sizeof (rate_buf)); - (void) printf(gettext("\t%s scanned out of %s at %s/s"), - examined_buf, total_buf, rate_buf); - if (hours_left < (30 * 24)) { - (void) printf(gettext(", %lluh%um to go\n"), - (u_longlong_t)hours_left, (uint_t)(mins_left % 60)); - } else { - (void) printf(gettext( - ", (scan is slow, no estimated time)\n")); - } + (void) printf(gettext("\t%s scanned at %s/s, " + "%s issued at %s/s, %s total\n"), + scanned_buf, srate_buf, issued_buf, irate_buf, total_buf); } else { - (void) printf(gettext("\t%s scanned out of %s\n"), - examined_buf, total_buf); + (void) printf(gettext("\t%s scanned, %s issued, %s total\n"), + scanned_buf, issued_buf, total_buf); } if (ps->pss_func == POOL_SCAN_RESILVER) { - (void) printf(gettext(" %s resilvered, %.2f%% done\n"), + (void) printf(gettext("\t%s resilvered, %.2f%% done"), processed_buf, 100 * fraction_done); } else if (ps->pss_func == POOL_SCAN_SCRUB) { - (void) printf(gettext(" %s repaired, %.2f%% done\n"), + (void) printf(gettext("\t%s repaired, %.2f%% done"), processed_buf, 100 * fraction_done); + } + + if (pause == 0) { + if (issue_rate >= 10 * 1024 * 1024) { + (void) printf(gettext(", %llu days " + "%02llu:%02llu:%02llu to go\n"), + 
(u_longlong_t)days_left, (u_longlong_t)hours_left, + (u_longlong_t)mins_left, (u_longlong_t)secs_left); + } else { + (void) printf(gettext(", no estimated " + "completion time\n")); + } + } else { + (void) printf(gettext("\n")); } } Index: cddl/contrib/opensolaris/cmd/ztest/ztest.c =================================================================== --- cddl/contrib/opensolaris/cmd/ztest/ztest.c +++ cddl/contrib/opensolaris/cmd/ztest/ztest.c @@ -374,15 +374,15 @@ { ztest_fzap, 1, &zopt_sometimes }, { ztest_dmu_snapshot_create_destroy, 1, &zopt_sometimes }, { ztest_spa_create_destroy, 1, &zopt_sometimes }, - { ztest_fault_inject, 1, &zopt_sometimes }, + { ztest_fault_inject, 1, &zopt_incessant }, { ztest_ddt_repair, 1, &zopt_sometimes }, { ztest_dmu_snapshot_hold, 1, &zopt_sometimes }, { ztest_reguid, 1, &zopt_rarely }, { ztest_spa_rename, 1, &zopt_rarely }, - { ztest_scrub, 1, &zopt_rarely }, + { ztest_scrub, 1, &zopt_often }, { ztest_spa_upgrade, 1, &zopt_rarely }, { ztest_dsl_dataset_promote_busy, 1, &zopt_rarely }, - { ztest_vdev_attach_detach, 1, &zopt_sometimes }, + { ztest_vdev_attach_detach, 1, &zopt_incessant }, { ztest_vdev_LUN_growth, 1, &zopt_rarely }, { ztest_vdev_add_remove, 1, &ztest_opts.zo_vdevtime }, Index: cddl/contrib/opensolaris/lib/libzfs/common/libzfs_status.c =================================================================== --- cddl/contrib/opensolaris/lib/libzfs/common/libzfs_status.c +++ cddl/contrib/opensolaris/lib/libzfs/common/libzfs_status.c @@ -219,7 +219,7 @@ */ (void) nvlist_lookup_uint64_array(nvroot, ZPOOL_CONFIG_SCAN_STATS, (uint64_t **)&ps, &psc); - if (ps && ps->pss_func == POOL_SCAN_RESILVER && + if (ps != NULL && ps->pss_func == POOL_SCAN_RESILVER && ps->pss_state == DSS_SCANNING) return (ZPOOL_STATUS_RESILVERING); Index: cddl/contrib/opensolaris/lib/libzpool/common/sys/zfs_context.h =================================================================== --- cddl/contrib/opensolaris/lib/libzpool/common/sys/zfs_context.h +++ 
cddl/contrib/opensolaris/lib/libzpool/common/sys/zfs_context.h @@ -408,6 +408,7 @@ #define TQ_NOQUEUE 0x02 /* Do not enqueue if can't dispatch */ #define TQ_FRONT 0x08 /* Queue in front */ +#define TASKQID_INVALID ((taskqid_t)0) extern taskq_t *system_taskq; @@ -421,6 +422,7 @@ taskq_ent_t *); extern void taskq_destroy(taskq_t *); extern void taskq_wait(taskq_t *); +extern void taskq_wait_id(taskq_t *, taskqid_t); extern int taskq_member(taskq_t *, void *); extern void system_taskq_init(void); extern void system_taskq_fini(void); Index: cddl/contrib/opensolaris/lib/libzpool/common/taskq.c =================================================================== --- cddl/contrib/opensolaris/lib/libzpool/common/taskq.c +++ cddl/contrib/opensolaris/lib/libzpool/common/taskq.c @@ -187,6 +187,12 @@ mutex_exit(&tq->tq_lock); } +void +taskq_wait_id(taskq_t *tq, taskqid_t id) +{ + taskq_wait(tq); +} + static void * taskq_thread(void *arg) { Index: sys/cddl/compat/opensolaris/kern/opensolaris_taskq.c =================================================================== --- sys/cddl/compat/opensolaris/kern/opensolaris_taskq.c +++ sys/cddl/compat/opensolaris/kern/opensolaris_taskq.c @@ -173,3 +173,9 @@ { taskqueue_drain_all(tq->tq_queue); } + +void +taskq_wait_id(taskq_t *tq, taskqid_t id) +{ + taskq_wait(tq); +} Index: sys/cddl/contrib/opensolaris/uts/common/fs/zfs/arc.c =================================================================== --- sys/cddl/contrib/opensolaris/uts/common/fs/zfs/arc.c +++ sys/cddl/contrib/opensolaris/uts/common/fs/zfs/arc.c @@ -339,7 +339,8 @@ * minimum lifespan of a prefetch block in clock ticks * (initialized in arc_init()) */ -static int arc_min_prefetch_lifespan; +static int zfs_arc_min_prefetch_ms = 1; +static int zfs_arc_min_prescient_prefetch_ms = 6; /* * If this percent of memory is free, don't throttle. 
@@ -779,8 +780,9 @@ kstat_named_t arcstat_meta_limit; kstat_named_t arcstat_meta_max; kstat_named_t arcstat_meta_min; - kstat_named_t arcstat_sync_wait_for_async; + kstat_named_t arcstat_async_upgrade_sync; kstat_named_t arcstat_demand_hit_predictive_prefetch; + kstat_named_t arcstat_demand_hit_prescient_prefetch; } arc_stats_t; static arc_stats_t arc_stats = { @@ -877,8 +879,9 @@ { "arc_meta_limit", KSTAT_DATA_UINT64 }, { "arc_meta_max", KSTAT_DATA_UINT64 }, { "arc_meta_min", KSTAT_DATA_UINT64 }, - { "sync_wait_for_async", KSTAT_DATA_UINT64 }, + { "async_upgrade_sync", KSTAT_DATA_UINT64 }, { "demand_hit_predictive_prefetch", KSTAT_DATA_UINT64 }, + { "demand_hit_prescient_prefetch", KSTAT_DATA_UINT64 }, }; #define ARCSTAT(stat) (arc_stats.stat.value.ui64) @@ -974,22 +977,23 @@ struct arc_callback { void *acb_private; - arc_done_func_t *acb_done; + arc_read_done_func_t *acb_done; arc_buf_t *acb_buf; boolean_t acb_compressed; zio_t *acb_zio_dummy; + zio_t *acb_zio_head; arc_callback_t *acb_next; }; typedef struct arc_write_callback arc_write_callback_t; struct arc_write_callback { - void *awcb_private; - arc_done_func_t *awcb_ready; - arc_done_func_t *awcb_children_ready; - arc_done_func_t *awcb_physdone; - arc_done_func_t *awcb_done; - arc_buf_t *awcb_buf; + void *awcb_private; + arc_write_done_func_t *awcb_ready; + arc_write_done_func_t *awcb_children_ready; + arc_write_done_func_t *awcb_physdone; + arc_write_done_func_t *awcb_done; + arc_buf_t *awcb_buf; }; /* @@ -1229,6 +1233,8 @@ #define HDR_IO_IN_PROGRESS(hdr) ((hdr)->b_flags & ARC_FLAG_IO_IN_PROGRESS) #define HDR_IO_ERROR(hdr) ((hdr)->b_flags & ARC_FLAG_IO_ERROR) #define HDR_PREFETCH(hdr) ((hdr)->b_flags & ARC_FLAG_PREFETCH) +#define HDR_PRESCIENT_PREFETCH(hdr) \ + ((hdr)->b_flags & ARC_FLAG_PRESCIENT_PREFETCH) #define HDR_COMPRESSION_ENABLED(hdr) \ ((hdr)->b_flags & ARC_FLAG_COMPRESSED_ARC) @@ -1392,6 +1398,11 @@ SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, l2c_only_size, CTLFLAG_RD, &ARC_l2c_only.arcs_size.rc_count, 0, 
"size of mru state"); +SYSCTL_UINT(_vfs_zfs, OID_AUTO, arc_min_prfetch_ms, CTLFLAG_RW, + &zfs_arc_min_prefetch_ms, 0, "Min life of prefetch block in ms"); +SYSCTL_UINT(_vfs_zfs, OID_AUTO, arc_min_prescient_prefetch_ms, CTLFLAG_RW, + &zfs_arc_min_prescient_prefetch_ms, 0, "Min life of prescient prefetched block in ms"); + /* * L2ARC Internals */ @@ -3544,6 +3555,8 @@ { arc_state_t *evicted_state, *state; int64_t bytes_evicted = 0; + int min_lifetime = HDR_PRESCIENT_PREFETCH(hdr) ? + zfs_arc_min_prescient_prefetch_ms : zfs_arc_min_prefetch_ms; ASSERT(MUTEX_HELD(hash_lock)); ASSERT(HDR_HAS_L1HDR(hdr)); @@ -3596,8 +3609,7 @@ /* prefetch buffers have a minimum lifespan */ if (HDR_IO_IN_PROGRESS(hdr) || ((hdr->b_flags & (ARC_FLAG_PREFETCH | ARC_FLAG_INDIRECT)) && - ddi_get_lbolt() - hdr->b_l1hdr.b_arc_access < - arc_min_prefetch_lifespan)) { + ddi_get_lbolt() - hdr->b_l1hdr.b_arc_access < min_lifetime * hz)) { ARCSTAT_BUMP(arcstat_evict_skip); return (bytes_evicted); } @@ -4968,13 +4980,15 @@ * - move the buffer to the head of the list if this is * another prefetch (to make it less likely to be evicted). */ - if (HDR_PREFETCH(hdr)) { + if (HDR_PREFETCH(hdr) || HDR_PRESCIENT_PREFETCH(hdr)) { if (refcount_count(&hdr->b_l1hdr.b_refcnt) == 0) { /* link protected by hash lock */ ASSERT(multilist_link_active( &hdr->b_l1hdr.b_arc_node)); } else { - arc_hdr_clear_flags(hdr, ARC_FLAG_PREFETCH); + arc_hdr_clear_flags(hdr, + ARC_FLAG_PREFETCH | + ARC_FLAG_PRESCIENT_PREFETCH); ARCSTAT_BUMP(arcstat_mru_hits); } hdr->b_l1hdr.b_arc_access = now; @@ -5005,10 +5019,13 @@ * MFU state. 
*/ - if (HDR_PREFETCH(hdr)) { + if (HDR_PREFETCH(hdr) || HDR_PRESCIENT_PREFETCH(hdr)) { new_state = arc_mru; - if (refcount_count(&hdr->b_l1hdr.b_refcnt) > 0) - arc_hdr_clear_flags(hdr, ARC_FLAG_PREFETCH); + if (refcount_count(&hdr->b_l1hdr.b_refcnt) > 0) { + arc_hdr_clear_flags(hdr, + ARC_FLAG_PREFETCH | + ARC_FLAG_PRESCIENT_PREFETCH); + } DTRACE_PROBE1(new_state__mru, arc_buf_hdr_t *, hdr); } else { new_state = arc_mfu; @@ -5029,11 +5046,7 @@ * If it was a prefetch, we will explicitly move it to * the head of the list now. */ - if ((HDR_PREFETCH(hdr)) != 0) { - ASSERT(refcount_is_zero(&hdr->b_l1hdr.b_refcnt)); - /* link protected by hash_lock */ - ASSERT(multilist_link_active(&hdr->b_l1hdr.b_arc_node)); - } + ARCSTAT_BUMP(arcstat_mfu_hits); hdr->b_l1hdr.b_arc_access = ddi_get_lbolt(); } else if (hdr->b_l1hdr.b_state == arc_mfu_ghost) { @@ -5044,12 +5057,11 @@ * MFU state. */ - if (HDR_PREFETCH(hdr)) { + if (HDR_PREFETCH(hdr) || HDR_PRESCIENT_PREFETCH(hdr)) { /* * This is a prefetch access... * move this block back to the MRU state. 
*/ - ASSERT0(refcount_count(&hdr->b_l1hdr.b_refcnt)); new_state = arc_mru; } @@ -5116,23 +5128,28 @@ demand, prefetch, !HDR_ISTYPE_METADATA(hdr), data, metadata, hits); } -/* a generic arc_done_func_t which you can use */ +/* a generic arc_read_done_func_t which you can use */ /* ARGSUSED */ void -arc_bcopy_func(zio_t *zio, arc_buf_t *buf, void *arg) +arc_bcopy_func(zio_t *zio, const zbookmark_phys_t *zb, const blkptr_t *bp, + arc_buf_t *buf, void *arg) { - if (zio == NULL || zio->io_error == 0) - bcopy(buf->b_data, arg, arc_buf_size(buf)); + if (buf == NULL) + return; + + bcopy(buf->b_data, arg, arc_buf_size(buf)); arc_buf_destroy(buf, arg); } -/* a generic arc_done_func_t */ +/* a generic arc_read_done_func_t */ +/* ARGSUSED */ void -arc_getbuf_func(zio_t *zio, arc_buf_t *buf, void *arg) +arc_getbuf_func(zio_t *zio, const zbookmark_phys_t *zb, const blkptr_t *bp, + arc_buf_t *buf, void *arg) { arc_buf_t **bufp = arg; - if (zio && zio->io_error) { - arc_buf_destroy(buf, arg); + + if (buf == NULL) { *bufp = NULL; } else { *bufp = buf; @@ -5164,7 +5181,6 @@ arc_callback_t *callback_list; arc_callback_t *acb; boolean_t freeable = B_FALSE; - boolean_t no_zio_error = (zio->io_error == 0); /* * The hdr was inserted into hash-table and removed from lists @@ -5190,7 +5206,7 @@ ASSERT3P(hash_lock, !=, NULL); } - if (no_zio_error) { + if (zio->io_error == 0) { /* byteswap if necessary */ if (BP_SHOULD_BYTESWAP(zio->io_bp)) { if (BP_GET_LEVEL(zio->io_bp) > 0) { @@ -5211,7 +5227,8 @@ callback_list = hdr->b_l1hdr.b_acb; ASSERT3P(callback_list, !=, NULL); - if (hash_lock && no_zio_error && hdr->b_l1hdr.b_state == arc_anon) { + if (hash_lock && zio->io_error == 0 && + hdr->b_l1hdr.b_state == arc_anon) { /* * Only call arc_access on anonymous buffers. 
This is because * if we've issued an I/O for an evicted buffer, we've already @@ -5232,14 +5249,21 @@ if (!acb->acb_done) continue; - /* This is a demand read since prefetches don't use callbacks */ callback_cnt++; + if (zio->io_error != 0) + continue; + int error = arc_buf_alloc_impl(hdr, acb->acb_private, - acb->acb_compressed, no_zio_error, &acb->acb_buf); - if (no_zio_error) { - zio->io_error = error; + acb->acb_compressed, + B_TRUE, &acb->acb_buf); + if (error != 0) { + arc_buf_destroy(acb->acb_buf, acb->acb_private); + acb->acb_buf = NULL; } + + if (zio->io_error == 0) + zio->io_error = error; } hdr->b_l1hdr.b_acb = NULL; arc_hdr_clear_flags(hdr, ARC_FLAG_IO_IN_PROGRESS); @@ -5252,7 +5276,7 @@ ASSERT(refcount_is_zero(&hdr->b_l1hdr.b_refcnt) || callback_list != NULL); - if (no_zio_error) { + if (zio->io_error == 0) { arc_hdr_verify(hdr, zio->io_bp); } else { arc_hdr_set_flags(hdr, ARC_FLAG_IO_ERROR); @@ -5285,8 +5309,10 @@ /* execute each callback and free its structure */ while ((acb = callback_list) != NULL) { - if (acb->acb_done) - acb->acb_done(zio, acb->acb_buf, acb->acb_private); + if (acb->acb_done) { + acb->acb_done(zio, &zio->io_bookmark, zio->io_bp, + acb->acb_buf, acb->acb_private); + } if (acb->acb_zio_dummy != NULL) { acb->acb_zio_dummy->io_error = zio->io_error; @@ -5320,7 +5346,7 @@ * for readers of this block. 
*/ int -arc_read(zio_t *pio, spa_t *spa, const blkptr_t *bp, arc_done_func_t *done, +arc_read(zio_t *pio, spa_t *spa, const blkptr_t *bp, arc_read_done_func_t *done, void *private, zio_priority_t priority, int zio_flags, arc_flags_t *arc_flags, const zbookmark_phys_t *zb) { @@ -5329,7 +5355,8 @@ zio_t *rzio; uint64_t guid = spa_load_guid(spa); boolean_t compressed_read = (zio_flags & ZIO_FLAG_RAW) != 0; - + int rc = 0; + ASSERT(!BP_IS_EMBEDDED(bp) || BPE_GET_ETYPE(bp) == BP_EMBEDDED_TYPE_DATA); @@ -5347,32 +5374,20 @@ *arc_flags |= ARC_FLAG_CACHED; if (HDR_IO_IN_PROGRESS(hdr)) { + zio_t *head_zio = hdr->b_l1hdr.b_acb->acb_zio_head; + ASSERT3P(head_zio, !=, NULL); if ((hdr->b_flags & ARC_FLAG_PRIO_ASYNC_READ) && priority == ZIO_PRIORITY_SYNC_READ) { /* - * This sync read must wait for an - * in-progress async read (e.g. a predictive - * prefetch). Async reads are queued - * separately at the vdev_queue layer, so - * this is a form of priority inversion. - * Ideally, we would "inherit" the demand - * i/o's priority by moving the i/o from - * the async queue to the synchronous queue, - * but there is currently no mechanism to do - * so. Track this so that we can evaluate - * the magnitude of this potential performance - * problem. - * - * Note that if the prefetch i/o is already - * active (has been issued to the device), - * the prefetch improved performance, because - * we issued it sooner than we would have - * without the prefetch. + * This is a sync read that needs to wait for + * an in-flight async read. Request that the + * zio have its priority upgraded. 
*/ - DTRACE_PROBE1(arc__sync__wait__for__async, + zio_change_priority(head_zio, priority); + DTRACE_PROBE1(arc__async__upgrade__sync, arc_buf_hdr_t *, hdr); - ARCSTAT_BUMP(arcstat_sync_wait_for_async); + ARCSTAT_BUMP(arcstat_async_upgrade_sync); } if (hdr->b_flags & ARC_FLAG_PREDICTIVE_PREFETCH) { arc_hdr_clear_flags(hdr, @@ -5399,6 +5414,7 @@ spa, NULL, NULL, NULL, zio_flags); ASSERT3P(acb->acb_done, !=, NULL); + acb->acb_zio_head = head_zio; acb->acb_next = hdr->b_l1hdr.b_acb; hdr->b_l1hdr.b_acb = acb; mutex_exit(hash_lock); @@ -5426,17 +5442,32 @@ arc_hdr_clear_flags(hdr, ARC_FLAG_PREDICTIVE_PREFETCH); } - ASSERT(!BP_IS_EMBEDDED(bp) || !BP_IS_HOLE(bp)); + if (hdr->b_flags & ARC_FLAG_PRESCIENT_PREFETCH) { + ARCSTAT_BUMP( + arcstat_demand_hit_prescient_prefetch); + arc_hdr_clear_flags(hdr, + ARC_FLAG_PRESCIENT_PREFETCH); + } + + ASSERT(!BP_IS_EMBEDDED(bp) || !BP_IS_HOLE(bp)); /* Get a buf with the desired data in it. */ - VERIFY0(arc_buf_alloc_impl(hdr, private, - compressed_read, B_TRUE, &buf)); + rc = arc_buf_alloc_impl(hdr, private, + compressed_read, B_TRUE, &buf); + if (rc != 0) { + arc_buf_destroy(buf, private); + buf = NULL; + } + ASSERT((zio_flags & ZIO_FLAG_SPECULATIVE) || + rc == 0 || rc != ENOENT); } else if (*arc_flags & ARC_FLAG_PREFETCH && refcount_count(&hdr->b_l1hdr.b_refcnt) == 0) { arc_hdr_set_flags(hdr, ARC_FLAG_PREFETCH); } DTRACE_PROBE1(arc__hit, arc_buf_hdr_t *, hdr); arc_access(hdr, hash_lock); + if (*arc_flags & ARC_FLAG_PRESCIENT_PREFETCH) + arc_hdr_set_flags(hdr, ARC_FLAG_PRESCIENT_PREFETCH); if (*arc_flags & ARC_FLAG_L2CACHE) arc_hdr_set_flags(hdr, ARC_FLAG_L2CACHE); mutex_exit(hash_lock); @@ -5446,7 +5477,7 @@ data, metadata, hits); if (done) - done(NULL, buf, private); + done(NULL, zb, bp, buf, private); } else { uint64_t lsize = BP_GET_LSIZE(bp); uint64_t psize = BP_GET_PSIZE(bp); @@ -5520,6 +5551,9 @@ if (*arc_flags & ARC_FLAG_PREFETCH) arc_hdr_set_flags(hdr, ARC_FLAG_PREFETCH); + if (*arc_flags & ARC_FLAG_PRESCIENT_PREFETCH) + 
arc_hdr_set_flags(hdr, ARC_FLAG_PRESCIENT_PREFETCH); + if (*arc_flags & ARC_FLAG_L2CACHE) arc_hdr_set_flags(hdr, ARC_FLAG_L2CACHE); if (BP_GET_LEVEL(bp) > 0) @@ -5549,14 +5583,17 @@ vd = NULL; } - if (priority == ZIO_PRIORITY_ASYNC_READ) + /* + * We count both async reads and scrub IOs as asynchronous so + * that both can be upgraded in the event of a cache hit while + * the read IO is still in-flight. + */ + if (priority == ZIO_PRIORITY_ASYNC_READ || + priority == ZIO_PRIORITY_SCRUB) arc_hdr_set_flags(hdr, ARC_FLAG_PRIO_ASYNC_READ); else arc_hdr_clear_flags(hdr, ARC_FLAG_PRIO_ASYNC_READ); - if (hash_lock != NULL) - mutex_exit(hash_lock); - /* * At this point, we have a level 1 cache miss. Try again in * L2ARC if possible. @@ -5637,6 +5674,11 @@ ZIO_FLAG_CANFAIL | ZIO_FLAG_DONT_PROPAGATE | ZIO_FLAG_DONT_RETRY, B_FALSE); + acb->acb_zio_head = rzio; + + if (hash_lock != NULL) + mutex_exit(hash_lock); + DTRACE_PROBE2(l2arc__read, vdev_t *, vd, zio_t *, rzio); ARCSTAT_INCR(arcstat_l2_read_bytes, size); @@ -5651,6 +5693,8 @@ return (0); /* l2arc read error; goto zio_read() */ + if (hash_lock != NULL) + mutex_enter(hash_lock); } else { DTRACE_PROBE1(l2arc__miss, arc_buf_hdr_t *, hdr); @@ -5671,7 +5715,11 @@ rzio = zio_read(pio, spa, bp, hdr->b_l1hdr.b_pabd, size, arc_read_done, hdr, priority, zio_flags, zb); + acb->acb_zio_head = rzio; + if (hash_lock != NULL) + mutex_exit(hash_lock); + if (*arc_flags & ARC_FLAG_WAIT) return (zio_wait(rzio)); @@ -6162,9 +6210,9 @@ zio_t * arc_write(zio_t *pio, spa_t *spa, uint64_t txg, blkptr_t *bp, arc_buf_t *buf, - boolean_t l2arc, const zio_prop_t *zp, arc_done_func_t *ready, - arc_done_func_t *children_ready, arc_done_func_t *physdone, - arc_done_func_t *done, void *private, zio_priority_t priority, + boolean_t l2arc, const zio_prop_t *zp, arc_write_done_func_t *ready, + arc_write_done_func_t *children_ready, arc_write_done_func_t *physdone, + arc_write_done_func_t *done, void *private, zio_priority_t priority, int zio_flags, const 
zbookmark_phys_t *zb) { arc_buf_hdr_t *hdr = buf->b_hdr; @@ -6590,9 +6638,6 @@ mutex_init(&arc_dnlc_evicts_lock, NULL, MUTEX_DEFAULT, NULL); cv_init(&arc_dnlc_evicts_cv, NULL, CV_DEFAULT, NULL); - - /* Convert seconds to clock ticks */ - arc_min_prefetch_lifespan = 1 * hz; /* set min cache to 1/32 of all memory, or arc_abs_min, whichever is more */ arc_c_min = MAX(allmem / 32, arc_abs_min); Index: sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dbuf.c =================================================================== --- sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dbuf.c +++ sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dbuf.c @@ -902,7 +902,8 @@ } static void -dbuf_read_done(zio_t *zio, arc_buf_t *buf, void *vdb) +dbuf_read_done(zio_t *zio, const zbookmark_phys_t *zb, const blkptr_t *bp, + arc_buf_t *buf, void *vdb) { dmu_buf_impl_t *db = vdb; @@ -916,19 +917,22 @@ ASSERT(db->db.db_data == NULL); if (db->db_level == 0 && db->db_freed_in_flight) { /* we were freed in flight; disregard any error */ + if (buf == NULL) { + buf = arc_alloc_buf(db->db_objset->os_spa, + db, DBUF_GET_BUFC_TYPE(db), db->db.db_size); + } arc_release(buf, db); bzero(buf->b_data, db->db.db_size); arc_buf_freeze(buf); db->db_freed_in_flight = FALSE; dbuf_set_data(db, buf); db->db_state = DB_CACHED; - } else if (zio == NULL || zio->io_error == 0) { + } else if (buf != NULL) { dbuf_set_data(db, buf); db->db_state = DB_CACHED; } else { ASSERT(db->db_blkid != DMU_BONUS_BLKID); ASSERT3P(db->db_buf, ==, NULL); - arc_buf_destroy(buf, db); db->db_state = DB_UNCACHED; } cv_broadcast(&db->db_changed); @@ -2326,7 +2330,8 @@ * prefetch if the next block down is our target. 
*/ static void -dbuf_prefetch_indirect_done(zio_t *zio, arc_buf_t *abuf, void *private) +dbuf_prefetch_indirect_done(zio_t *zio, const zbookmark_phys_t *zb, + const blkptr_t *iobp, arc_buf_t *abuf, void *private) { dbuf_prefetch_arg_t *dpa = private; @@ -2365,13 +2370,18 @@ dbuf_rele(db, FTAG); } + if (abuf == NULL) { + kmem_free(dpa, sizeof(*dpa)); + return; + } + dpa->dpa_curlevel--; uint64_t nextblkid = dpa->dpa_zb.zb_blkid >> (dpa->dpa_epbs * (dpa->dpa_curlevel - dpa->dpa_zb.zb_level)); blkptr_t *bp = ((blkptr_t *)abuf->b_data) + P2PHASE(nextblkid, 1ULL << dpa->dpa_epbs); - if (BP_IS_HOLE(bp) || (zio != NULL && zio->io_error != 0)) { + if (BP_IS_HOLE(bp)) { kmem_free(dpa, sizeof (*dpa)); } else if (dpa->dpa_curlevel == dpa->dpa_zb.zb_level) { ASSERT3U(nextblkid, ==, dpa->dpa_zb.zb_blkid); @@ -3746,7 +3756,7 @@ * ready callback so that we can properly handle an indirect * block that only contains holes. */ - arc_done_func_t *children_ready_cb = NULL; + arc_write_done_func_t *children_ready_cb = NULL; if (db->db_level != 0) children_ready_cb = dbuf_write_children_ready; Index: sys/cddl/contrib/opensolaris/uts/common/fs/zfs/ddt.c =================================================================== --- sys/cddl/contrib/opensolaris/uts/common/fs/zfs/ddt.c +++ sys/cddl/contrib/opensolaris/uts/common/fs/zfs/ddt.c @@ -1112,14 +1112,26 @@ void ddt_sync(spa_t *spa, uint64_t txg) { + dsl_scan_t *scn = spa->spa_dsl_pool->dp_scan; dmu_tx_t *tx; - zio_t *rio = zio_root(spa, NULL, NULL, - ZIO_FLAG_CANFAIL | ZIO_FLAG_SPECULATIVE | ZIO_FLAG_SELF_HEAL); + zio_t *rio; ASSERT(spa_syncing_txg(spa) == txg); tx = dmu_tx_create_assigned(spa->spa_dsl_pool, txg); + rio = zio_root(spa, NULL, NULL, + ZIO_FLAG_CANFAIL | ZIO_FLAG_SPECULATIVE | ZIO_FLAG_SELF_HEAL); + + /* + * This function may cause an immediate scan of ddt blocks (see + * the comment above dsl_scan_ddt() for details). 
We set the + * scan's root zio here so that we can wait for any scan IOs in + * addition to the regular ddt IOs. + */ + ASSERT3P(scn->scn_zio_root, ==, NULL); + scn->scn_zio_root = rio; + for (enum zio_checksum c = 0; c < ZIO_CHECKSUM_FUNCTIONS; c++) { ddt_t *ddt = spa->spa_ddt[c]; if (ddt == NULL) @@ -1129,6 +1141,7 @@ } (void) zio_wait(rio); + scn->scn_zio_root = NULL; dmu_tx_commit(tx); } Index: sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dmu_objset.c =================================================================== --- sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dmu_objset.c +++ sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dmu_objset.c @@ -349,6 +349,7 @@ ASSERT(ds == NULL || MUTEX_HELD(&ds->ds_opening_lock)); +#if 0 /* * The $ORIGIN dataset (if it exists) doesn't have an associated * objset, so there's no reason to open it. The $ORIGIN dataset @@ -359,6 +360,7 @@ ASSERT3P(ds->ds_dir, !=, spa_get_dsl(spa)->dp_origin_snap->ds_dir); } +#endif os = kmem_zalloc(sizeof (objset_t), KM_SLEEP); os->os_dsl_dataset = ds; Index: sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dmu_traverse.c =================================================================== --- sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dmu_traverse.c +++ sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dmu_traverse.c @@ -499,8 +499,9 @@ const zbookmark_phys_t *zb, const dnode_phys_t *dnp, void *arg) { prefetch_data_t *pfd = arg; - arc_flags_t aflags = ARC_FLAG_NOWAIT | ARC_FLAG_PREFETCH; - + arc_flags_t aflags = ARC_FLAG_NOWAIT | ARC_FLAG_PREFETCH | + ARC_FLAG_PRESCIENT_PREFETCH; + ASSERT(pfd->pd_bytes_fetched >= 0); if (bp == NULL) return (0); Index: sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dsl_scan.c =================================================================== --- sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dsl_scan.c +++ sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dsl_scan.c @@ -51,28 +51,136 @@ #include #include #include +#include #ifdef _KERNEL #include #endif +/* + * 
Grand theory statement on scan queue sorting + * + * Scanning is implemented by recursively traversing all indirection levels + * in an object and reading all blocks referenced from said objects. This + * results in us approximately traversing the object from lowest logical + * offset to the highest. For best performance, we would want the logical + * blocks to be physically contiguous. However, this is frequently not the + * case with pools given the allocation patterns of copy-on-write filesystems. + * So instead, we put the I/Os into a reordering queue and issue them in a + * way that will most benefit physical disks (LBA-order). + * + * Queue management: + * + * Ideally, we would want to scan all metadata and queue up all block I/O + * prior to starting to issue it, because that allows us to do an optimal + * sorting job. This can however consume large amounts of memory. Therefore + * we continuously monitor the size of the queues and constrain them to 5% + * (zfs_scan_mem_lim_fact) of physmem. If the queues grow larger than this + * limit, we clear out a few of the largest extents at the head of the queues + * to make room for more scanning. Hopefully, these extents will be fairly + * large and contiguous, allowing us to approach sequential I/O throughput + * even without a fully sorted tree. + * + * Metadata scanning takes place in dsl_scan_visit(), which is called from + * dsl_scan_sync() every spa_sync(). If we have either fully scanned all + * metadata on the pool, or we need to make room in memory because our + * queues are too large, dsl_scan_visit() is postponed and + * scan_io_queues_run() is called from dsl_scan_sync() instead. This implies + * that metadata scanning and queued I/O issuing are mutually exclusive. This + * allows us to provide maximum sequential I/O throughput for the majority of + * I/O's issued since sequential I/O performance is significantly negatively + * impacted if it is interleaved with random I/O. 
+ * + * Implementation Notes + * + * One side effect of the queued scanning algorithm is that the scanning code + * needs to be notified whenever a block is freed. This is needed to allow + * the scanning code to remove these I/Os from the issuing queue. Additionally, + * we do not attempt to queue gang blocks to be issued sequentially since this + * is very hard to do and would have an extremely limited performance benefit. + * Instead, we simply issue gang I/Os as soon as we find them using the legacy + * algorithm. + * + * Backwards compatibility + * + * This new algorithm is backwards compatible with the legacy on-disk data + * structures (and therefore does not require a new feature flag). + * Periodically during scanning (see zfs_scan_checkpoint_intval), the scan + * will stop scanning metadata (in logical order) and wait for all outstanding + * sorted I/O to complete. Once this is done, we write out a checkpoint + * bookmark, indicating that we have scanned everything logically before it. + * If the pool is imported on a machine without the new sorting algorithm, + * the scan simply resumes from the last checkpoint using the legacy algorithm. 
+ */ + typedef int (scan_cb_t)(dsl_pool_t *, const blkptr_t *, const zbookmark_phys_t *); static scan_cb_t dsl_scan_scrub_cb; -static void dsl_scan_cancel_sync(void *, dmu_tx_t *); -static void dsl_scan_sync_state(dsl_scan_t *, dmu_tx_t *); -static boolean_t dsl_scan_restarting(dsl_scan_t *, dmu_tx_t *); -unsigned int zfs_top_maxinflight = 32; /* maximum I/Os per top-level */ -unsigned int zfs_resilver_delay = 2; /* number of ticks to delay resilver */ -unsigned int zfs_scrub_delay = 4; /* number of ticks to delay scrub */ +static int scan_ds_queue_compare(const void *a, const void *b); +static int scan_prefetch_queue_compare(const void *a, const void *b); +static void scan_ds_queue_clear(dsl_scan_t *scn); +static boolean_t scan_ds_queue_contains(dsl_scan_t *scn, uint64_t dsobj, + uint64_t *txg); +static void scan_ds_queue_insert(dsl_scan_t *scn, uint64_t dsobj, uint64_t txg); +static void scan_ds_queue_remove(dsl_scan_t *scn, uint64_t dsobj); +static void scan_ds_queue_sync(dsl_scan_t *scn, dmu_tx_t *tx); + +extern int zfs_vdev_async_write_active_min_dirty_percent; + +/* + * By default zfs will check to ensure it is not over the hard memory + * limit before each txg. If finer-grained control of this is needed + * this value can be set to 1 to enable checking before scanning each + * block. + */ +int zfs_scan_strict_mem_lim = B_FALSE; + +/* + * Maximum number of parallelly executing I/Os per top-level vdev. + * Tune with care. Very high settings (hundreds) are known to trigger + * some firmware bugs and resets on certain SSDs. 
+ */ +int zfs_top_maxinflight = 32; /* maximum I/Os per top-level */ +unsigned int zfs_resilver_delay = 2; /* number of ticks to delay resilver -- 2 is a good number */ +unsigned int zfs_scrub_delay = 4; /* number of ticks to delay scrub -- 4 is a good number */ unsigned int zfs_scan_idle = 50; /* idle window in clock ticks */ -unsigned int zfs_scan_min_time_ms = 1000; /* min millisecs to scrub per txg */ +/* + * Maximum number of parallelly executed bytes per leaf vdev. We attempt + * to strike a balance here between keeping the vdev queues full of I/Os + * at all times and not overflowing the queues to cause long latency, + * which would cause long txg sync times. No matter what, we will not + * overload the drives with I/O, since that is protected by + * zfs_vdev_scrub_max_active. + */ +unsigned long zfs_scan_vdev_limit = 4 << 20; + +int zfs_scan_issue_strategy = 0; +int zfs_scan_legacy = B_FALSE; /* don't queue & sort zios, go direct */ +uint64_t zfs_scan_max_ext_gap = 2 << 20; /* in bytes */ + +unsigned int zfs_scan_checkpoint_intval = 7200; /* seconds */ +#define ZFS_SCAN_CHECKPOINT_INTVAL SEC_TO_TICK(zfs_scan_checkpoint_intval) + +/* + * fill_weight is non-tunable at runtime, so we copy it at module init from + * zfs_scan_fill_weight. Runtime adjustments to zfs_scan_fill_weight would + * break queue sorting. 
+ */ +uint64_t zfs_scan_fill_weight = 3; +static uint64_t fill_weight; + +/* See dsl_scan_should_clear() for details on the memory limit tunables */ +uint64_t zfs_scan_mem_lim_min = 16 << 20; /* bytes */ +uint64_t zfs_scan_mem_lim_soft_max = 128 << 20; /* bytes */ +int zfs_scan_mem_lim_fact = 20; /* fraction of physmem */ +int zfs_scan_mem_lim_soft_fact = 20; /* fraction of mem lim above */ + +unsigned int zfs_scrub_min_time_ms = 1000; /* min millisecs to scrub per txg */ unsigned int zfs_free_min_time_ms = 1000; /* min millisecs to free per txg */ unsigned int zfs_obsolete_min_time_ms = 500; /* min millisecs to obsolete per txg */ -unsigned int zfs_resilver_min_time_ms = 3000; /* min millisecs to resilver - per txg */ +unsigned int zfs_resilver_min_time_ms = 3000; /* min millisecs to resilver per txg */ boolean_t zfs_no_scrub_io = B_FALSE; /* set to disable scrub i/o */ boolean_t zfs_no_scrub_prefetch = B_FALSE; /* set to disable scrub prefetch */ @@ -86,7 +194,7 @@ SYSCTL_UINT(_vfs_zfs, OID_AUTO, scan_idle, CTLFLAG_RWTUN, &zfs_scan_idle, 0, "Idle scan window in clock ticks"); SYSCTL_UINT(_vfs_zfs, OID_AUTO, scan_min_time_ms, CTLFLAG_RWTUN, - &zfs_scan_min_time_ms, 0, "Min millisecs to scrub per txg"); + &zfs_scrub_min_time_ms, 0, "Min millisecs to scrub per txg"); SYSCTL_UINT(_vfs_zfs, OID_AUTO, free_min_time_ms, CTLFLAG_RWTUN, &zfs_free_min_time_ms, 0, "Min millisecs to free per txg"); SYSCTL_UINT(_vfs_zfs, OID_AUTO, resilver_min_time_ms, CTLFLAG_RWTUN, @@ -95,6 +203,10 @@ &zfs_no_scrub_io, 0, "Disable scrub I/O"); SYSCTL_INT(_vfs_zfs, OID_AUTO, no_scrub_prefetch, CTLFLAG_RWTUN, &zfs_no_scrub_prefetch, 0, "Disable scrub prefetching"); +SYSCTL_UINT(_vfs_zfs, OID_AUTO, zfs_scan_legacy, CTLFLAG_RWTUN, + &zfs_scan_legacy, 0, "Scrub using legacy non-sequential method"); +SYSCTL_UINT(_vfs_zfs, OID_AUTO, zfs_scan_checkpoint_interval, CTLFLAG_RWTUN, + &zfs_scan_checkpoint_intval, 0, "Scan progress on-disk checkpointing interval"); enum ddt_class zfs_scrub_ddt_class_max 
= DDT_CLASS_DUPLICATE; /* max number of blocks to free in a single TXG */ @@ -102,7 +214,19 @@ SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, free_max_blocks, CTLFLAG_RWTUN, &zfs_async_block_max_blocks, 0, "Maximum number of blocks to free in one TXG"); +/* + * We wait a few txgs after importing a pool to begin scanning so that + * the import / mounting code isn't held up by scrub / resilver IO. + * Unfortunately, it is a bit difficult to determine exactly how long + * this will take since userspace will trigger fs mounts asynchronously + * and the kernel will create zvol minors asynchronously. As a result, + * the value provided here is a bit arbitrary, but represents a + * reasonable estimate of how many txgs it will take to finish fully + * importing a pool + */ +#define SCAN_IMPORT_WAIT_TXGS 5 + #define DSL_SCAN_IS_SCRUB_RESILVER(scn) \ ((scn)->scn_phys.scn_func == POOL_SCAN_SCRUB || \ (scn)->scn_phys.scn_func == POOL_SCAN_RESILVER) @@ -124,6 +248,177 @@ dsl_scan_scrub_cb, /* POOL_SCAN_RESILVER */ }; +/* In core node for the scn->scn_queue. Represents a dataset to be scanned */ +typedef struct { + uint64_t sds_dsobj; + uint64_t sds_txg; + avl_node_t sds_node; +} scan_ds_t; + +/* + * This controls what conditions are placed on dsl_scan_sync_state(): + * SYNC_OPTIONAL) write out scn_phys iff scn_bytes_pending == 0 + * SYNC_MANDATORY) write out scn_phys always. scn_bytes_pending must be 0. + * SYNC_CACHED) if scn_bytes_pending == 0, write out scn_phys. Otherwise + * write out the scn_phys_cached version. + * See dsl_scan_sync_state for details. + */ +typedef enum { + SYNC_OPTIONAL, + SYNC_MANDATORY, + SYNC_CACHED +} state_sync_type_t; + +/* + * This struct represents the minimum information needed to reconstruct a + * zio for sequential scanning. This is useful because many of these will + * accumulate in the sequential IO queues before being issued, so saving + * memory matters here. 
+ */ +typedef struct scan_io { + /* fields from blkptr_t */ + uint64_t sio_offset; + uint64_t sio_blk_prop; + uint64_t sio_phys_birth; + uint64_t sio_birth; + zio_cksum_t sio_cksum; + uint32_t sio_asize; + + /* fields from zio_t */ + int sio_flags; + zbookmark_phys_t sio_zb; + + /* members for queue sorting */ + union { + avl_node_t sio_addr_node; /* link into issuing queue */ + list_node_t sio_list_node; /* link for issuing to disk */ + } sio_nodes; +} scan_io_t; + +struct dsl_scan_io_queue { + dsl_scan_t *q_scn; /* associated dsl_scan_t */ + vdev_t *q_vd; /* top-level vdev that this queue represents */ + + /* trees used for sorting I/Os and extents of I/Os */ + range_tree_t *q_exts_by_addr; + avl_tree_t q_exts_by_size; + avl_tree_t q_sios_by_addr; + + /* members for zio rate limiting */ + uint64_t q_maxinflight_bytes; + uint64_t q_inflight_bytes; + kcondvar_t q_zio_cv; /* used under vd->vdev_scan_io_queue_lock */ + + /* per txg statistics */ + uint64_t q_total_seg_size_this_txg; + uint64_t q_segs_this_txg; + uint64_t q_total_zio_size_this_txg; + uint64_t q_zios_this_txg; +}; + +/* private data for dsl_scan_prefetch_cb() */ +typedef struct scan_prefetch_ctx { + refcount_t spc_refcnt; /* refcount for memory management */ + dsl_scan_t *spc_scn; /* dsl_scan_t for the pool */ + boolean_t spc_root; /* is this prefetch for an objset? 
*/ + uint8_t spc_indblkshift; /* dn_indblkshift of current dnode */ + uint16_t spc_datablkszsec; /* dn_datablkszsec of current dnode */ +} scan_prefetch_ctx_t; + +/* private data for dsl_scan_prefetch() */ +typedef struct scan_prefetch_issue_ctx { + avl_node_t spic_avl_node; /* link into scn->scn_prefetch_queue */ + scan_prefetch_ctx_t *spic_spc; /* spc for the callback */ + blkptr_t spic_bp; /* bp to prefetch */ + zbookmark_phys_t spic_zb; /* bookmark to prefetch */ +} scan_prefetch_issue_ctx_t; + +static void scan_exec_io(dsl_pool_t *dp, const blkptr_t *bp, int zio_flags, + const zbookmark_phys_t *zb, dsl_scan_io_queue_t *queue); +static void scan_io_queue_insert_impl(dsl_scan_io_queue_t *queue, + scan_io_t *sio); + +static dsl_scan_io_queue_t *scan_io_queue_create(vdev_t *vd); +static void scan_io_queues_destroy(dsl_scan_t *scn); + +static kmem_cache_t *sio_cache; + +void +scan_init(void) +{ + /* + * This is used in ext_size_compare() to weight segments + * based on how sparse they are. This cannot be changed + * mid-scan and the tree comparison functions don't currently + * have a mechanism for passing additional context to the + * compare functions. 
Thus we store this value globally and + * we only allow it to be set at module initialization time + */ + fill_weight = zfs_scan_fill_weight; + + sio_cache = kmem_cache_create("sio_cache", + sizeof (scan_io_t), 0, NULL, NULL, NULL, NULL, NULL, 0); +} + +void +scan_fini(void) +{ + kmem_cache_destroy(sio_cache); +} + +static inline boolean_t +dsl_scan_is_running(const dsl_scan_t *scn) +{ + return (scn->scn_phys.scn_state == DSS_SCANNING); +} + +boolean_t +dsl_scan_resilvering(dsl_pool_t *dp) +{ + return (dsl_scan_is_running(dp->dp_scan) && + dp->dp_scan->scn_phys.scn_func == POOL_SCAN_RESILVER); +} + +static inline void +sio2bp(const scan_io_t *sio, blkptr_t *bp, uint64_t vdev_id) +{ + bzero(bp, sizeof (*bp)); + DVA_SET_ASIZE(&bp->blk_dva[0], sio->sio_asize); + DVA_SET_VDEV(&bp->blk_dva[0], vdev_id); + DVA_SET_OFFSET(&bp->blk_dva[0], sio->sio_offset); + bp->blk_prop = sio->sio_blk_prop; + bp->blk_phys_birth = sio->sio_phys_birth; + bp->blk_birth = sio->sio_birth; + bp->blk_fill = 1; /* we always only work with data pointers */ + bp->blk_cksum = sio->sio_cksum; +} + +static inline void +bp2sio(const blkptr_t *bp, scan_io_t *sio, int dva_i) +{ + /* we discard the vdev id, since we can deduce it from the queue */ + sio->sio_offset = DVA_GET_OFFSET(&bp->blk_dva[dva_i]); + sio->sio_asize = DVA_GET_ASIZE(&bp->blk_dva[dva_i]); + sio->sio_blk_prop = bp->blk_prop; + sio->sio_phys_birth = bp->blk_phys_birth; + sio->sio_birth = bp->blk_birth; + sio->sio_cksum = bp->blk_cksum; +} + +void +dsl_scan_global_init(void) +{ + /* + * This is used in ext_size_compare() to weight segments + * based on how sparse they are. This cannot be changed + * mid-scan and the tree comparison functions don't currently + * have a mechanism for passing additional context to the + * compare functions. 
Thus we store this value globally and + * we only allow it to be set at module initialization time + */ + fill_weight = zfs_scan_fill_weight; +} + int dsl_scan_init(dsl_pool_t *dp, uint64_t txg) { @@ -144,6 +439,13 @@ scn->scn_async_destroying = spa_feature_is_active(dp->dp_spa, SPA_FEATURE_ASYNC_DESTROY); + bcopy(&scn->scn_phys, &scn->scn_phys_cached, sizeof (scn->scn_phys)); + avl_create(&scn->scn_queue, scan_ds_queue_compare, sizeof (scan_ds_t), + offsetof(scan_ds_t, sds_node)); + avl_create(&scn->scn_prefetch_queue, scan_prefetch_queue_compare, + sizeof (scan_prefetch_issue_ctx_t), + offsetof(scan_prefetch_issue_ctx_t, spic_avl_node)); + err = zap_lookup(dp->dp_meta_objset, DMU_POOL_DIRECTORY_OBJECT, "scrub_func", sizeof (uint64_t), 1, &f); if (err == 0) { @@ -154,7 +456,7 @@ scn->scn_restart_txg = txg; zfs_dbgmsg("old-style scrub was in progress; " "restarting new-style scrub in txg %llu", - scn->scn_restart_txg); + (longlong_t)scn->scn_restart_txg); /* * Load the queue obj from the old location so that it @@ -172,7 +474,14 @@ else if (err) return (err); - if (scn->scn_phys.scn_state == DSS_SCANNING && + /* + * We might be restarting after a reboot, so jump the issued + * counter to how far we've scanned. We know we're consistent + * up to here. 
+ */ + scn->scn_issued_before_pass = scn->scn_phys.scn_examined; + + if (dsl_scan_is_running(scn) && spa_prev_software_version(dp->dp_spa) < SPA_VERSION_SCAN) { /* * A new-type scrub was in progress on an old @@ -184,10 +493,26 @@ scn->scn_restart_txg = txg; zfs_dbgmsg("new-style scrub was modified " "by old software; restarting in txg %llu", - scn->scn_restart_txg); + (longlong_t)scn->scn_restart_txg); } } + /* reload the queue into the in-core state */ + if (scn->scn_phys.scn_queue_obj != 0) { + zap_cursor_t zc; + zap_attribute_t za; + + for (zap_cursor_init(&zc, dp->dp_meta_objset, + scn->scn_phys.scn_queue_obj); + zap_cursor_retrieve(&zc, &za) == 0; + (void) zap_cursor_advance(&zc)) { + scan_ds_queue_insert(scn, + zfs_strtonum(za.za_name, NULL), + za.za_first_integer); + } + zap_cursor_fini(&zc); + } + spa_scan_stat_init(spa); return (0); } @@ -195,19 +520,116 @@ void dsl_scan_fini(dsl_pool_t *dp) { - if (dp->dp_scan) { + if (dp->dp_scan != NULL) { + dsl_scan_t *scn = dp->dp_scan; + + if (scn->scn_taskq != NULL) + taskq_destroy(scn->scn_taskq); + scan_ds_queue_clear(scn); + avl_destroy(&scn->scn_queue); + avl_destroy(&scn->scn_prefetch_queue); + kmem_free(dp->dp_scan, sizeof (dsl_scan_t)); dp->dp_scan = NULL; } } +static boolean_t +dsl_scan_restarting(dsl_scan_t *scn, dmu_tx_t *tx) +{ + return (scn->scn_restart_txg != 0 && + scn->scn_restart_txg <= tx->tx_txg); +} + +boolean_t +dsl_scan_scrubbing(const dsl_pool_t *dp) +{ + dsl_scan_phys_t *scn_phys = &dp->dp_scan->scn_phys; + + return (scn_phys->scn_state == DSS_SCANNING && + scn_phys->scn_func == POOL_SCAN_SCRUB); +} + +boolean_t +dsl_scan_is_paused_scrub(const dsl_scan_t *scn) +{ + return (dsl_scan_scrubbing(scn->scn_dp) && + scn->scn_phys.scn_flags & DSF_SCRUB_PAUSED); +} + +/* + * Writes out a persistent dsl_scan_phys_t record to the pool directory. + * Because we can be running in the block sorting algorithm, we do not always + * want to write out the record, only when it is "safe" to do so. 
This safety + * condition is achieved by making sure that the sorting queues are empty + * (scn_bytes_pending == 0). When this condition is not true, the sync'd state + * is inconsistent with how much actual scanning progress has been made. The + * kind of sync to be performed is specified by the sync_type argument. If the + * sync is optional, we only sync if the queues are empty. If the sync is + * mandatory, we do a hard ASSERT to make sure that the queues are empty. The + * third possible state is a "cached" sync. This is done in response to: + * 1) The dataset that was in the last sync'd dsl_scan_phys_t having been + * destroyed, so we wouldn't be able to restart scanning from it. + * 2) The snapshot that was in the last sync'd dsl_scan_phys_t having been + * superseded by a newer snapshot. + * 3) The dataset that was in the last sync'd dsl_scan_phys_t having been + * swapped with its clone. + * In all cases, a cached sync simply rewrites the last record we've written, + * just slightly modified. For the modifications that are performed to the + * last written dsl_scan_phys_t, see dsl_scan_ds_destroyed, + * dsl_scan_ds_snapshotted and dsl_scan_ds_clone_swapped. 
+ */ +static void +dsl_scan_sync_state(dsl_scan_t *scn, dmu_tx_t *tx, state_sync_type_t sync_type) +{ + int i; + spa_t *spa = scn->scn_dp->dp_spa; + + ASSERT(sync_type != SYNC_MANDATORY || scn->scn_bytes_pending == 0); + if (scn->scn_bytes_pending == 0) { + for (i = 0; i < spa->spa_root_vdev->vdev_children; i++) { + vdev_t *vd = spa->spa_root_vdev->vdev_child[i]; + dsl_scan_io_queue_t *q = vd->vdev_scan_io_queue; + + if (q == NULL) + continue; + + mutex_enter(&vd->vdev_scan_io_queue_lock); + ASSERT3P(avl_first(&q->q_sios_by_addr), ==, NULL); + ASSERT3P(avl_first(&q->q_exts_by_size), ==, NULL); + ASSERT3P(range_tree_first(q->q_exts_by_addr), ==, NULL); + mutex_exit(&vd->vdev_scan_io_queue_lock); + } + + if (scn->scn_phys.scn_queue_obj != 0) + scan_ds_queue_sync(scn, tx); + VERIFY0(zap_update(scn->scn_dp->dp_meta_objset, + DMU_POOL_DIRECTORY_OBJECT, + DMU_POOL_SCAN, sizeof (uint64_t), SCAN_PHYS_NUMINTS, + &scn->scn_phys, tx)); + bcopy(&scn->scn_phys, &scn->scn_phys_cached, + sizeof (scn->scn_phys)); + + if (scn->scn_checkpointing) + zfs_dbgmsg("finish scan checkpoint"); + + scn->scn_checkpointing = B_FALSE; + scn->scn_last_checkpoint = ddi_get_lbolt(); + } else if (sync_type == SYNC_CACHED) { + VERIFY0(zap_update(scn->scn_dp->dp_meta_objset, + DMU_POOL_DIRECTORY_OBJECT, + DMU_POOL_SCAN, sizeof (uint64_t), SCAN_PHYS_NUMINTS, + &scn->scn_phys_cached, tx)); + } +} + /* ARGSUSED */ static int dsl_scan_setup_check(void *arg, dmu_tx_t *tx) { dsl_scan_t *scn = dmu_tx_pool(tx)->dp_scan; - if (scn->scn_phys.scn_state == DSS_SCANNING) + if (dsl_scan_is_running(scn)) return (SET_ERROR(EBUSY)); return (0); @@ -222,7 +644,7 @@ dsl_pool_t *dp = scn->scn_dp; spa_t *spa = dp->dp_spa; - ASSERT(scn->scn_phys.scn_state != DSS_SCANNING); + ASSERT(!dsl_scan_is_running(scn)); ASSERT(*funcp > POOL_SCAN_NONE && *funcp < POOL_SCAN_FUNCS); bzero(&scn->scn_phys, sizeof (scn->scn_phys)); scn->scn_phys.scn_func = *funcp; @@ -233,8 +655,11 @@ scn->scn_phys.scn_start_time = gethrestime_sec(); 
scn->scn_phys.scn_errors = 0; scn->scn_phys.scn_to_examine = spa->spa_root_vdev->vdev_stat.vs_alloc; + scn->scn_issued_before_pass = 0; scn->scn_restart_txg = 0; scn->scn_done_txg = 0; + scn->scn_last_checkpoint = 0; + scn->scn_checkpointing = B_FALSE; spa_scan_stat_init(spa); if (DSL_SCAN_IS_SCRUB_RESILVER(scn)) { @@ -267,8 +692,10 @@ if (dp->dp_blkstats == NULL) { dp->dp_blkstats = kmem_alloc(sizeof (zfs_all_blkstats_t), KM_SLEEP); + mutex_init(&dp->dp_blkstats->zab_lock, NULL, + MUTEX_DEFAULT, NULL); } - bzero(dp->dp_blkstats, sizeof (zfs_all_blkstats_t)); + bzero(&dp->dp_blkstats->zab_type, sizeof (dp->dp_blkstats->zab_type)); if (spa_version(spa) < SPA_VERSION_DSL_SCRUB) ot = DMU_OT_ZAP_OTHER; @@ -276,13 +703,52 @@ scn->scn_phys.scn_queue_obj = zap_create(dp->dp_meta_objset, ot ? ot : DMU_OT_SCAN_QUEUE, DMU_OT_NONE, 0, tx); - dsl_scan_sync_state(scn, tx); + bcopy(&scn->scn_phys, &scn->scn_phys_cached, sizeof (scn->scn_phys)); + dsl_scan_sync_state(scn, tx, SYNC_MANDATORY); + spa_history_log_internal(spa, "scan setup", tx, "func=%u mintxg=%llu maxtxg=%llu", *funcp, scn->scn_phys.scn_min_txg, scn->scn_phys.scn_max_txg); } +/* + * Called by the ZFS_IOC_POOL_SCAN ioctl to start a scrub or resilver. + * Can also be called to resume a paused scrub. + */ +int +dsl_scan(dsl_pool_t *dp, pool_scan_func_t func) +{ + spa_t *spa = dp->dp_spa; + dsl_scan_t *scn = dp->dp_scan; + + /* + * Purge all vdev caches and probe all devices. We do this here + * rather than in sync context because this requires a writer lock + * on the spa_config lock, which we can't do from sync context. The + * spa_scrub_reopen flag indicates that vdev_open() should not + * attempt to start another scrub. 
+ */ + spa_vdev_state_enter(spa, SCL_NONE); + spa->spa_scrub_reopen = B_TRUE; + vdev_reopen(spa->spa_root_vdev); + spa->spa_scrub_reopen = B_FALSE; + (void) spa_vdev_state_exit(spa, NULL, 0); + + if (func == POOL_SCAN_SCRUB && dsl_scan_is_paused_scrub(scn)) { + /* got scrub start cmd, resume paused scrub */ + int err = dsl_scrub_set_pause_resume(scn->scn_dp, + POOL_SCRUB_NORMAL); + if (err == 0) + return (ECANCELED); + + return (SET_ERROR(err)); + } + + return (dsl_sync_task(spa_name(spa), dsl_scan_setup_check, + dsl_scan_setup_sync, &func, 0, ZFS_SPACE_CHECK_NONE)); +} + /* ARGSUSED */ static void dsl_scan_done(dsl_scan_t *scn, boolean_t complete, dmu_tx_t *tx) @@ -310,10 +776,11 @@ } if (scn->scn_phys.scn_queue_obj != 0) { - VERIFY(0 == dmu_object_free(dp->dp_meta_objset, + VERIFY0(dmu_object_free(dp->dp_meta_objset, scn->scn_phys.scn_queue_obj, tx)); scn->scn_phys.scn_queue_obj = 0; } + scan_ds_queue_clear(scn); scn->scn_phys.scn_flags &= ~DSF_SCRUB_PAUSED; @@ -321,14 +788,23 @@ * If we were "restarted" from a stopped state, don't bother * with anything else. */ - if (scn->scn_phys.scn_state != DSS_SCANNING) + if (!dsl_scan_is_running(scn)) { + ASSERT(!scn->scn_is_sorted); return; + } - if (complete) - scn->scn_phys.scn_state = DSS_FINISHED; - else - scn->scn_phys.scn_state = DSS_CANCELED; + if (scn->scn_is_sorted) { + scan_io_queues_destroy(scn); + scn->scn_is_sorted = B_FALSE; + if (scn->scn_taskq != NULL) { + taskq_destroy(scn->scn_taskq); + scn->scn_taskq = NULL; + } + } + + scn->scn_phys.scn_state = complete ? 
DSS_FINISHED : DSS_CANCELED; + if (dsl_scan_restarting(scn, tx)) spa_history_log_internal(spa, "scan aborted, restarting", tx, "errors=%llu", spa_get_errlog_size(spa)); @@ -340,12 +816,6 @@ "errors=%llu", spa_get_errlog_size(spa)); if (DSL_SCAN_IS_SCRUB_RESILVER(scn)) { - mutex_enter(&spa->spa_scrub_lock); - while (spa->spa_scrub_inflight > 0) { - cv_wait(&spa->spa_scrub_io_cv, - &spa->spa_scrub_lock); - } - mutex_exit(&spa->spa_scrub_lock); spa->spa_scrub_started = B_FALSE; spa->spa_scrub_active = B_FALSE; @@ -381,6 +851,8 @@ } scn->scn_phys.scn_end_time = gethrestime_sec(); + + ASSERT(!dsl_scan_is_running(scn)); } /* ARGSUSED */ @@ -389,7 +861,7 @@ { dsl_scan_t *scn = dmu_tx_pool(tx)->dp_scan; - if (scn->scn_phys.scn_state != DSS_SCANNING) + if (!dsl_scan_is_running(scn)) return (SET_ERROR(ENOENT)); return (0); } @@ -401,7 +873,7 @@ dsl_scan_t *scn = dmu_tx_pool(tx)->dp_scan; dsl_scan_done(scn, B_FALSE, tx); - dsl_scan_sync_state(scn, tx); + dsl_scan_sync_state(scn, tx, SYNC_MANDATORY); spa_event_notify(scn->scn_dp->dp_spa, NULL, NULL, ESC_ZFS_SCRUB_ABORT); } @@ -412,16 +884,6 @@ dsl_scan_cancel_sync, NULL, 3, ZFS_SPACE_CHECK_RESERVED)); } -boolean_t -dsl_scan_is_paused_scrub(const dsl_scan_t *scn) -{ - if (dsl_scan_scrubbing(scn->scn_dp) && - scn->scn_phys.scn_flags & DSF_SCRUB_PAUSED) - return (B_TRUE); - - return (B_FALSE); -} - static int dsl_scrub_pause_resume_check(void *arg, dmu_tx_t *tx) { @@ -456,7 +918,7 @@ /* can't pause a scrub when there is no in-progress scrub */ spa->spa_scan_pass_scrub_pause = gethrestime_sec(); scn->scn_phys.scn_flags |= DSF_SCRUB_PAUSED; - dsl_scan_sync_state(scn, tx); + dsl_scan_sync_state(scn, tx, SYNC_CACHED); spa_event_notify(spa, NULL, NULL, ESC_ZFS_SCRUB_PAUSED); } else { ASSERT3U(*cmd, ==, POOL_SCRUB_NORMAL); @@ -470,7 +932,7 @@ gethrestime_sec() - spa->spa_scan_pass_scrub_pause; spa->spa_scan_pass_scrub_pause = 0; scn->scn_phys.scn_flags &= ~DSF_SCRUB_PAUSED; - dsl_scan_sync_state(scn, tx); + dsl_scan_sync_state(scn, tx, 
SYNC_CACHED); } } } @@ -486,25 +948,25 @@ ZFS_SPACE_CHECK_RESERVED)); } -boolean_t -dsl_scan_scrubbing(const dsl_pool_t *dp) + +/* start a new scan, or restart an existing one. */ +void +dsl_resilver_restart(dsl_pool_t *dp, uint64_t txg) { - dsl_scan_t *scn = dp->dp_scan; + if (txg == 0) { + dmu_tx_t *tx; + tx = dmu_tx_create_dd(dp->dp_mos_dir); + VERIFY(0 == dmu_tx_assign(tx, TXG_WAIT)); - if (scn->scn_phys.scn_state == DSS_SCANNING && - scn->scn_phys.scn_func == POOL_SCAN_SCRUB) - return (B_TRUE); - - return (B_FALSE); + txg = dmu_tx_get_txg(tx); + dp->dp_scan->scn_restart_txg = txg; + dmu_tx_commit(tx); + } else { + dp->dp_scan->scn_restart_txg = txg; + } + zfs_dbgmsg("restarting resilver txg=%llu", txg); } -static void dsl_scan_visitbp(blkptr_t *bp, const zbookmark_phys_t *zb, - dnode_phys_t *dnp, dsl_dataset_t *ds, dsl_scan_t *scn, - dmu_objset_type_t ostype, dmu_tx_t *tx); -static void dsl_scan_visitdnode(dsl_scan_t *, dsl_dataset_t *ds, - dmu_objset_type_t ostype, - dnode_phys_t *dnp, uint64_t object, dmu_tx_t *tx); - void dsl_free(dsl_pool_t *dp, uint64_t txg, const blkptr_t *bp) { @@ -519,27 +981,171 @@ pio->io_flags)); } -static uint64_t -dsl_scan_ds_maxtxg(dsl_dataset_t *ds) +static int +scan_ds_queue_compare(const void *a, const void *b) { - uint64_t smt = ds->ds_dir->dd_pool->dp_scan->scn_phys.scn_max_txg; - if (ds->ds_is_snapshot) - return (MIN(smt, dsl_dataset_phys(ds)->ds_creation_txg)); - return (smt); + const scan_ds_t *sds_a = a, *sds_b = b; + + if (sds_a->sds_dsobj < sds_b->sds_dsobj) + return (-1); + if (sds_a->sds_dsobj == sds_b->sds_dsobj) + return (0); + return (1); } static void -dsl_scan_sync_state(dsl_scan_t *scn, dmu_tx_t *tx) +scan_ds_queue_clear(dsl_scan_t *scn) { - VERIFY0(zap_update(scn->scn_dp->dp_meta_objset, - DMU_POOL_DIRECTORY_OBJECT, - DMU_POOL_SCAN, sizeof (uint64_t), SCAN_PHYS_NUMINTS, - &scn->scn_phys, tx)); + void *cookie = NULL; + scan_ds_t *sds; + while ((sds = avl_destroy_nodes(&scn->scn_queue, &cookie)) != NULL) { + 
kmem_free(sds, sizeof (*sds)); + } } -extern int zfs_vdev_async_write_active_min_dirty_percent; +static boolean_t +scan_ds_queue_contains(dsl_scan_t *scn, uint64_t dsobj, uint64_t *txg) +{ + scan_ds_t srch, *sds; + srch.sds_dsobj = dsobj; + sds = avl_find(&scn->scn_queue, &srch, NULL); + if (sds != NULL && txg != NULL) + *txg = sds->sds_txg; + return (sds != NULL); +} + +static void +scan_ds_queue_insert(dsl_scan_t *scn, uint64_t dsobj, uint64_t txg) +{ + scan_ds_t *sds; + avl_index_t where; + + sds = kmem_zalloc(sizeof (*sds), KM_SLEEP); + sds->sds_dsobj = dsobj; + sds->sds_txg = txg; + + VERIFY3P(avl_find(&scn->scn_queue, sds, &where), ==, NULL); + avl_insert(&scn->scn_queue, sds, where); +} + +static void +scan_ds_queue_remove(dsl_scan_t *scn, uint64_t dsobj) +{ + scan_ds_t srch, *sds; + + srch.sds_dsobj = dsobj; + + sds = avl_find(&scn->scn_queue, &srch, NULL); + VERIFY(sds != NULL); + avl_remove(&scn->scn_queue, sds); + kmem_free(sds, sizeof (*sds)); +} + +static void +scan_ds_queue_sync(dsl_scan_t *scn, dmu_tx_t *tx) +{ + dsl_pool_t *dp = scn->scn_dp; + spa_t *spa = dp->dp_spa; + dmu_object_type_t ot = (spa_version(spa) >= SPA_VERSION_DSL_SCRUB) ? + DMU_OT_SCAN_QUEUE : DMU_OT_ZAP_OTHER; + + ASSERT0(scn->scn_bytes_pending); + ASSERT(scn->scn_phys.scn_queue_obj != 0); + + VERIFY0(dmu_object_free(dp->dp_meta_objset, + scn->scn_phys.scn_queue_obj, tx)); + scn->scn_phys.scn_queue_obj = zap_create(dp->dp_meta_objset, ot, + DMU_OT_NONE, 0, tx); + for (scan_ds_t *sds = avl_first(&scn->scn_queue); + sds != NULL; sds = AVL_NEXT(&scn->scn_queue, sds)) { + VERIFY0(zap_add_int_key(dp->dp_meta_objset, + scn->scn_phys.scn_queue_obj, sds->sds_dsobj, + sds->sds_txg, tx)); + } +} + +/* + * Computes the memory limit state that we're currently in. A sorted scan + * needs quite a bit of memory to hold the sorting queue, so we need to + * reasonably constrain the size so it doesn't impact overall system + * performance. 
We compute two limits: + * 1) Hard memory limit: if the amount of memory used by the sorting + * queues on a pool gets above this value, we stop the metadata + * scanning portion and start issuing the queued up and sorted + * I/Os to reduce memory usage. + * This limit is calculated as a fraction of physmem (by default 5%). + * We constrain the lower bound of the hard limit to an absolute + * minimum of zfs_scan_mem_lim_min (default: 16 MiB). We also constrain + * the upper bound to 5% of the space allocated in the pool's normal + * class - no chance we'll ever need that much memory, but just to keep the value in check. + * 2) Soft memory limit: once we hit the hard memory limit, we start + * issuing I/O to reduce queue memory usage, but we don't want to + * completely empty out the queues, since we might be able to find I/Os + * that will fill in the gaps of our non-sequential IOs at some point + * in the future. So we stop the issuing of I/Os once the amount of + * memory used drops below the soft limit (at which point we stop issuing + * I/O and start scanning metadata again). + * + * This limit is calculated by subtracting a fraction of the hard + * limit from the hard limit. By default this fraction is 5%, so + * the soft limit is 95% of the hard limit. We cap the size of the + * difference between the hard and soft limits at an absolute + * maximum of zfs_scan_mem_lim_soft_max (default: 128 MiB) - this is + * sufficient to not cause too frequent switching between the + * metadata scan and I/O issue (even at 2k recordsize, 128 MiB's + * worth of queues is about 1.2 GiB of on-pool data, so scanning + * that should take at least a decent fraction of a second). 
+ */ static boolean_t +dsl_scan_should_clear(dsl_scan_t *scn) +{ + vdev_t *rvd = scn->scn_dp->dp_spa->spa_root_vdev; + uint64_t mlim_hard, mlim_soft, mused; + uint64_t alloc = metaslab_class_get_alloc(spa_normal_class( + scn->scn_dp->dp_spa)); + + mlim_hard = MAX((physmem / zfs_scan_mem_lim_fact) * PAGESIZE, + zfs_scan_mem_lim_min); + mlim_hard = MIN(mlim_hard, alloc / 20); + mlim_soft = mlim_hard - MIN(mlim_hard / zfs_scan_mem_lim_soft_fact, + zfs_scan_mem_lim_soft_max); + mused = 0; + for (uint64_t i = 0; i < rvd->vdev_children; i++) { + vdev_t *tvd = rvd->vdev_child[i]; + dsl_scan_io_queue_t *queue; + + mutex_enter(&tvd->vdev_scan_io_queue_lock); + queue = tvd->vdev_scan_io_queue; + if (queue != NULL) { + /* #extents in exts_by_size = # in exts_by_addr */ + mused += avl_numnodes(&queue->q_exts_by_size) * + sizeof (range_seg_t) + + avl_numnodes(&queue->q_sios_by_addr) * + sizeof (scan_io_t); + } + mutex_exit(&tvd->vdev_scan_io_queue_lock); + } + + dprintf("current scan memory usage: %llu bytes\n", (longlong_t)mused); + + if (mused == 0) + ASSERT0(scn->scn_bytes_pending); + + /* + * If we are above our hard limit, we need to clear out memory. + * If we are below our soft limit, we need to accumulate sequential IOs. + * Otherwise, we should keep doing whatever we are currently doing. 
+ */ + if (mused >= mlim_hard) + return (B_TRUE); + else if (mused < mlim_soft) + return (B_FALSE); + else + return (scn->scn_clearing); +} + +static boolean_t dsl_scan_check_suspend(dsl_scan_t *scn, const zbookmark_phys_t *zb) { /* we never skip user/group accounting objects */ @@ -558,9 +1164,6 @@ /* * We suspend if: - * - we have scanned for the maximum time: an entire txg - * timeout (default 5 sec) - * or * - we have scanned for at least the minimum time (default 1 sec * for scrub, 3 sec for resilver), and either we have sufficient * dirty data that we are starting to write more quickly @@ -569,16 +1172,25 @@ * or * - the spa is shutting down because this pool is being exported * or the machine is rebooting. + * or + * - the scan queue has reached its memory use limit */ - int mintime = (scn->scn_phys.scn_func == POOL_SCAN_RESILVER) ? - zfs_resilver_min_time_ms : zfs_scan_min_time_ms; - uint64_t elapsed_nanosecs = gethrtime() - scn->scn_sync_start_time; + uint64_t elapsed_nanosecs = gethrtime(); + uint64_t curr_time_ns = gethrtime(); + uint64_t scan_time_ns = curr_time_ns - scn->scn_sync_start_time; + uint64_t sync_time_ns = curr_time_ns - + scn->scn_dp->dp_spa->spa_sync_starttime; + int dirty_pct = scn->scn_dp->dp_dirty_total * 100 / zfs_dirty_data_max; - if (elapsed_nanosecs / NANOSEC >= zfs_txg_timeout || - (NSEC2MSEC(elapsed_nanosecs) > mintime && - (txg_sync_waiting(scn->scn_dp) || - dirty_pct >= zfs_vdev_async_write_active_min_dirty_percent)) || - spa_shutting_down(scn->scn_dp->dp_spa)) { + int mintime = (scn->scn_phys.scn_func == POOL_SCAN_RESILVER) ? 
+ zfs_resilver_min_time_ms : zfs_scrub_min_time_ms; + + if ((NSEC2MSEC(scan_time_ns) > mintime && + (dirty_pct >= zfs_vdev_async_write_active_min_dirty_percent || + txg_sync_waiting(scn->scn_dp) || + NSEC2SEC(sync_time_ns) >= zfs_txg_timeout)) || + spa_shutting_down(scn->scn_dp->dp_spa) || + (zfs_scan_strict_mem_lim && dsl_scan_should_clear(scn))) { if (zb) { dprintf("suspending at bookmark %llx/%llx/%llx/%llx\n", (longlong_t)zb->zb_objset, @@ -586,12 +1198,16 @@ (longlong_t)zb->zb_level, (longlong_t)zb->zb_blkid); scn->scn_phys.scn_bookmark = *zb; + } else { + dsl_scan_phys_t *scnp = &scn->scn_phys; + + dprintf("suspending at at DDT bookmark " + "%llx/%llx/%llx/%llx\n", + (longlong_t)scnp->scn_ddt_bookmark.ddb_class, + (longlong_t)scnp->scn_ddt_bookmark.ddb_type, + (longlong_t)scnp->scn_ddt_bookmark.ddb_checksum, + (longlong_t)scnp->scn_ddt_bookmark.ddb_cursor); } - dprintf("suspending at DDT bookmark %llx/%llx/%llx/%llx\n", - (longlong_t)scn->scn_phys.scn_ddt_bookmark.ddb_class, - (longlong_t)scn->scn_phys.scn_ddt_bookmark.ddb_type, - (longlong_t)scn->scn_phys.scn_ddt_bookmark.ddb_checksum, - (longlong_t)scn->scn_phys.scn_ddt_bookmark.ddb_cursor); scn->scn_suspending = B_TRUE; return (B_TRUE); } @@ -690,28 +1306,278 @@ zil_free(zilog); } -/* ARGSUSED */ +/* + * We compare scan_prefetch_issue_ctx_t's based on their bookmarks. The idea + * here is to sort the AVL tree by the order each block will be needed. 
+ */ +static int +scan_prefetch_queue_compare(const void *a, const void *b) +{ + const scan_prefetch_issue_ctx_t *spic_a = a, *spic_b = b; + const scan_prefetch_ctx_t *spc_a = spic_a->spic_spc; + const scan_prefetch_ctx_t *spc_b = spic_b->spic_spc; + + return (zbookmark_compare(spc_a->spc_datablkszsec, + spc_a->spc_indblkshift, spc_b->spc_datablkszsec, + spc_b->spc_indblkshift, &spic_a->spic_zb, &spic_b->spic_zb)); +} + static void -dsl_scan_prefetch(dsl_scan_t *scn, arc_buf_t *buf, blkptr_t *bp, - uint64_t objset, uint64_t object, uint64_t blkid) +scan_prefetch_ctx_rele(scan_prefetch_ctx_t *spc, void *tag) { - zbookmark_phys_t czb; - arc_flags_t flags = ARC_FLAG_NOWAIT | ARC_FLAG_PREFETCH; + if (refcount_remove(&spc->spc_refcnt, tag) == 0) { + refcount_destroy(&spc->spc_refcnt); + kmem_free(spc, sizeof (scan_prefetch_ctx_t)); + } +} +static scan_prefetch_ctx_t * +scan_prefetch_ctx_create(dsl_scan_t *scn, dnode_phys_t *dnp, void *tag) +{ + scan_prefetch_ctx_t *spc; + + spc = kmem_alloc(sizeof (scan_prefetch_ctx_t), KM_SLEEP); + refcount_create(&spc->spc_refcnt); + refcount_add(&spc->spc_refcnt, tag); + spc->spc_scn = scn; + if (dnp != NULL) { + spc->spc_datablkszsec = dnp->dn_datablkszsec; + spc->spc_indblkshift = dnp->dn_indblkshift; + spc->spc_root = B_FALSE; + } else { + spc->spc_datablkszsec = 0; + spc->spc_indblkshift = 0; + spc->spc_root = B_TRUE; + } + + return (spc); +} + +static void +scan_prefetch_ctx_add_ref(scan_prefetch_ctx_t *spc, void *tag) +{ + refcount_add(&spc->spc_refcnt, tag); +} + +static boolean_t +dsl_scan_check_prefetch_resume(scan_prefetch_ctx_t *spc, + const zbookmark_phys_t *zb) +{ + zbookmark_phys_t *last_zb = &spc->spc_scn->scn_prefetch_bookmark; + dnode_phys_t tmp_dnp; + dnode_phys_t *dnp = (spc->spc_root) ? 
NULL : &tmp_dnp; + + if (zb->zb_objset != last_zb->zb_objset) + return (B_TRUE); + if ((int64_t)zb->zb_object < 0) + return (B_FALSE); + + tmp_dnp.dn_datablkszsec = spc->spc_datablkszsec; + tmp_dnp.dn_indblkshift = spc->spc_indblkshift; + + if (zbookmark_subtree_completed(dnp, zb, last_zb)) + return (B_TRUE); + + return (B_FALSE); +} + +static void +dsl_scan_prefetch(scan_prefetch_ctx_t *spc, blkptr_t *bp, zbookmark_phys_t *zb) +{ + avl_index_t idx; + dsl_scan_t *scn = spc->spc_scn; + spa_t *spa = scn->scn_dp->dp_spa; + scan_prefetch_issue_ctx_t *spic; + if (zfs_no_scrub_prefetch) return; - if (BP_IS_HOLE(bp) || bp->blk_birth <= scn->scn_phys.scn_min_txg || - (BP_GET_LEVEL(bp) == 0 && BP_GET_TYPE(bp) != DMU_OT_DNODE)) + if (BP_IS_HOLE(bp) || bp->blk_birth <= scn->scn_phys.scn_cur_min_txg || + (BP_GET_LEVEL(bp) == 0 && BP_GET_TYPE(bp) != DMU_OT_DNODE && + BP_GET_TYPE(bp) != DMU_OT_OBJSET)) return; - SET_BOOKMARK(&czb, objset, object, BP_GET_LEVEL(bp), blkid); + if (dsl_scan_check_prefetch_resume(spc, zb)) + return; - (void) arc_read(scn->scn_zio_root, scn->scn_dp->dp_spa, bp, - NULL, NULL, ZIO_PRIORITY_ASYNC_READ, - ZIO_FLAG_CANFAIL | ZIO_FLAG_SCAN_THREAD, &flags, &czb); + scan_prefetch_ctx_add_ref(spc, scn); + spic = kmem_alloc(sizeof (scan_prefetch_issue_ctx_t), KM_SLEEP); + spic->spic_spc = spc; + spic->spic_bp = *bp; + spic->spic_zb = *zb; + + /* + * Add the IO to the queue of blocks to prefetch. This allows us to + * prioritize blocks that we will need first for the main traversal + * thread. 
+ */ + mutex_enter(&spa->spa_scrub_lock); + if (avl_find(&scn->scn_prefetch_queue, spic, &idx) != NULL) { + /* this block is already queued for prefetch */ + kmem_free(spic, sizeof (scan_prefetch_issue_ctx_t)); + scan_prefetch_ctx_rele(spc, scn); + mutex_exit(&spa->spa_scrub_lock); + return; + } + + avl_insert(&scn->scn_prefetch_queue, spic, idx); + cv_broadcast(&spa->spa_scrub_io_cv); + mutex_exit(&spa->spa_scrub_lock); } +static void +dsl_scan_prefetch_dnode(dsl_scan_t *scn, dnode_phys_t *dnp, + uint64_t objset, uint64_t object) +{ + int i; + zbookmark_phys_t zb; + scan_prefetch_ctx_t *spc; + + if (dnp->dn_nblkptr == 0 && !(dnp->dn_flags & DNODE_FLAG_SPILL_BLKPTR)) + return; + + SET_BOOKMARK(&zb, objset, object, 0, 0); + + spc = scan_prefetch_ctx_create(scn, dnp, FTAG); + + for (i = 0; i < dnp->dn_nblkptr; i++) { + zb.zb_level = BP_GET_LEVEL(&dnp->dn_blkptr[i]); + zb.zb_blkid = i; + dsl_scan_prefetch(spc, &dnp->dn_blkptr[i], &zb); + } + + if (dnp->dn_flags & DNODE_FLAG_SPILL_BLKPTR) { + zb.zb_level = 0; + zb.zb_blkid = DMU_SPILL_BLKID; + dsl_scan_prefetch(spc, &dnp->dn_spill, &zb); + } + + scan_prefetch_ctx_rele(spc, FTAG); +} + +void +dsl_scan_prefetch_cb(zio_t *zio, const zbookmark_phys_t *zb, const blkptr_t *bp, + arc_buf_t *buf, void *private) +{ + scan_prefetch_ctx_t *spc = private; + dsl_scan_t *scn = spc->spc_scn; + spa_t *spa = scn->scn_dp->dp_spa; + + /* broadcast that the IO has completed for rate limiting purposes */ + mutex_enter(&spa->spa_scrub_lock); + ASSERT3U(spa->spa_scrub_inflight, >=, BP_GET_PSIZE(bp)); + spa->spa_scrub_inflight -= BP_GET_PSIZE(bp); + cv_broadcast(&spa->spa_scrub_io_cv); + mutex_exit(&spa->spa_scrub_lock); + + /* if there was an error or we are done prefetching, just clean up */ + if (buf == NULL || scn->scn_suspending) + goto out; + + if (BP_GET_LEVEL(bp) > 0) { + int i; + blkptr_t *cbp; + int epb = BP_GET_LSIZE(bp) >> SPA_BLKPTRSHIFT; + zbookmark_phys_t czb; + + for (i = 0, cbp = buf->b_data; i < epb; i++, cbp++) { + 
SET_BOOKMARK(&czb, zb->zb_objset, zb->zb_object, + zb->zb_level - 1, zb->zb_blkid * epb + i); + dsl_scan_prefetch(spc, cbp, &czb); + } + } else if (BP_GET_TYPE(bp) == DMU_OT_DNODE) { + dnode_phys_t *cdnp = buf->b_data; + int i; + int epb = BP_GET_LSIZE(bp) >> DNODE_SHIFT; + + for (i = 0, cdnp = buf->b_data; i < epb; i++, cdnp++) { + dsl_scan_prefetch_dnode(scn, cdnp, + zb->zb_objset, zb->zb_blkid * epb + i); + } + } else if (BP_GET_TYPE(bp) == DMU_OT_OBJSET) { + objset_phys_t *osp = buf->b_data; + + dsl_scan_prefetch_dnode(scn, &osp->os_meta_dnode, + zb->zb_objset, DMU_META_DNODE_OBJECT); + + if (OBJSET_BUF_HAS_USERUSED(buf)) { + dsl_scan_prefetch_dnode(scn, + &osp->os_groupused_dnode, zb->zb_objset, + DMU_GROUPUSED_OBJECT); + dsl_scan_prefetch_dnode(scn, + &osp->os_userused_dnode, zb->zb_objset, + DMU_USERUSED_OBJECT); + } + } + +out: + if (buf != NULL) + arc_buf_destroy(buf, private); + scan_prefetch_ctx_rele(spc, scn); +} + +/* ARGSUSED */ +static void +dsl_scan_prefetch_thread(void *arg) +{ + dsl_scan_t *scn = arg; + spa_t *spa = scn->scn_dp->dp_spa; + vdev_t *rvd = spa->spa_root_vdev; + uint64_t maxinflight = rvd->vdev_children * zfs_top_maxinflight; + scan_prefetch_issue_ctx_t *spic; + + /* loop until we are told to stop */ + while (!scn->scn_prefetch_stop) { + arc_flags_t flags = ARC_FLAG_NOWAIT | + ARC_FLAG_PRESCIENT_PREFETCH | ARC_FLAG_PREFETCH; + int zio_flags = ZIO_FLAG_CANFAIL | ZIO_FLAG_SCAN_THREAD; + + mutex_enter(&spa->spa_scrub_lock); + + /* + * Wait until we have an IO to issue and are not above our + * maximum in flight limit. 
+ */ + while (!scn->scn_prefetch_stop && + (avl_numnodes(&scn->scn_prefetch_queue) == 0 || + spa->spa_scrub_inflight >= scn->scn_maxinflight_bytes)) { + cv_wait(&spa->spa_scrub_io_cv, &spa->spa_scrub_lock); + } + + /* recheck if we should stop since we waited for the cv */ + if (scn->scn_prefetch_stop) { + mutex_exit(&spa->spa_scrub_lock); + break; + } + + /* remove the prefetch IO from the tree */ + spic = avl_first(&scn->scn_prefetch_queue); + spa->spa_scrub_inflight += BP_GET_PSIZE(&spic->spic_bp); + avl_remove(&scn->scn_prefetch_queue, spic); + + mutex_exit(&spa->spa_scrub_lock); + + /* issue the prefetch asynchronously */ + (void) arc_read(scn->scn_zio_root, scn->scn_dp->dp_spa, + &spic->spic_bp, dsl_scan_prefetch_cb, spic->spic_spc, + ZIO_PRIORITY_SCRUB, zio_flags, &flags, &spic->spic_zb); + + kmem_free(spic, sizeof (scan_prefetch_issue_ctx_t)); + } + + ASSERT(scn->scn_prefetch_stop); + + /* free any prefetches we didn't get to complete */ + mutex_enter(&spa->spa_scrub_lock); + while ((spic = avl_first(&scn->scn_prefetch_queue)) != NULL) { + avl_remove(&scn->scn_prefetch_queue, spic); + scan_prefetch_ctx_rele(spic->spic_spc, scn); + kmem_free(spic, sizeof (scan_prefetch_issue_ctx_t)); + } + ASSERT0(avl_numnodes(&scn->scn_prefetch_queue)); + mutex_exit(&spa->spa_scrub_lock); +} + static boolean_t dsl_scan_check_resume(dsl_scan_t *scn, const dnode_phys_t *dnp, const zbookmark_phys_t *zb) @@ -748,6 +1614,13 @@ return (B_FALSE); } +static void dsl_scan_visitbp(blkptr_t *bp, const zbookmark_phys_t *zb, + dnode_phys_t *dnp, dsl_dataset_t *ds, dsl_scan_t *scn, + dmu_objset_type_t ostype, dmu_tx_t *tx); +static void dsl_scan_visitdnode( + dsl_scan_t *, dsl_dataset_t *ds, dmu_objset_type_t ostype, + dnode_phys_t *dnp, uint64_t object, dmu_tx_t *tx); + /* * Return nonzero on i/o error. * Return new buf to write out in *bufp. 
@@ -769,16 +1642,12 @@ arc_buf_t *buf; err = arc_read(NULL, dp->dp_spa, bp, arc_getbuf_func, &buf, - ZIO_PRIORITY_ASYNC_READ, zio_flags, &flags, zb); + ZIO_PRIORITY_SCRUB, zio_flags, &flags, zb); if (err) { scn->scn_phys.scn_errors++; return (err); } for (i = 0, cbp = buf->b_data; i < epb; i++, cbp++) { - dsl_scan_prefetch(scn, buf, cbp, zb->zb_objset, - zb->zb_object, zb->zb_blkid * epb + i); - } - for (i = 0, cbp = buf->b_data; i < epb; i++, cbp++) { zbookmark_phys_t czb; SET_BOOKMARK(&czb, zb->zb_objset, zb->zb_object, @@ -791,24 +1660,17 @@ } else if (BP_GET_TYPE(bp) == DMU_OT_DNODE) { arc_flags_t flags = ARC_FLAG_WAIT; dnode_phys_t *cdnp; - int i, j; + int i; int epb = BP_GET_LSIZE(bp) >> DNODE_SHIFT; arc_buf_t *buf; err = arc_read(NULL, dp->dp_spa, bp, arc_getbuf_func, &buf, - ZIO_PRIORITY_ASYNC_READ, zio_flags, &flags, zb); + ZIO_PRIORITY_SCRUB, zio_flags, &flags, zb); if (err) { scn->scn_phys.scn_errors++; return (err); } for (i = 0, cdnp = buf->b_data; i < epb; i++, cdnp++) { - for (j = 0; j < cdnp->dn_nblkptr; j++) { - blkptr_t *cbp = &cdnp->dn_blkptr[j]; - dsl_scan_prefetch(scn, buf, cbp, - zb->zb_objset, zb->zb_blkid * epb + i, j); - } - } - for (i = 0, cdnp = buf->b_data; i < epb; i++, cdnp++) { dsl_scan_visitdnode(scn, ds, ostype, cdnp, zb->zb_blkid * epb + i, tx); } @@ -820,7 +1682,7 @@ arc_buf_t *buf; err = arc_read(NULL, dp->dp_spa, bp, arc_getbuf_func, &buf, - ZIO_PRIORITY_ASYNC_READ, zio_flags, &flags, zb); + ZIO_PRIORITY_SCRUB, zio_flags, &flags, zb); if (err) { scn->scn_phys.scn_errors++; return (err); @@ -886,20 +1748,14 @@ dmu_objset_type_t ostype, dmu_tx_t *tx) { dsl_pool_t *dp = scn->scn_dp; - arc_buf_t *buf = NULL; - blkptr_t bp_toread = *bp; + blkptr_t *bp_toread = NULL; - /* ASSERT(pbuf == NULL || arc_released(pbuf)); */ - if (dsl_scan_check_suspend(scn, zb)) return; if (dsl_scan_check_resume(scn, dnp, zb)) return; - if (BP_IS_HOLE(bp)) - return; - scn->scn_visited_this_txg++; dprintf_bp(bp, @@ -908,12 +1764,22 @@ zb->zb_objset, 
zb->zb_object, zb->zb_level, zb->zb_blkid, bp); - if (bp->blk_birth <= scn->scn_phys.scn_cur_min_txg) + if (BP_IS_HOLE(bp)) { + scn->scn_holes_this_txg++; return; + } - if (dsl_scan_recurse(scn, ds, ostype, dnp, &bp_toread, zb, tx) != 0) + if (bp->blk_birth <= scn->scn_phys.scn_cur_min_txg) { + scn->scn_lt_min_this_txg++; return; + } + bp_toread = kmem_alloc(sizeof (blkptr_t), KM_SLEEP); + *bp_toread = *bp; + + if (dsl_scan_recurse(scn, ds, ostype, dnp, bp_toread, zb, tx) != 0) + goto out; /* bp_toread is freed at out:, don't leak it */ + /* * If dsl_scan_ddt() has already visited this block, it will have * already done any translations or scrubbing, so don't call the @@ -921,8 +1787,8 @@ */ if (ddt_class_contains(dp->dp_spa, scn->scn_phys.scn_ddt_class_max, bp)) { - ASSERT(buf == NULL); - return; + scn->scn_ddt_contained_this_txg++; + goto out; } /* @@ -932,9 +1798,14 @@ * Don't scan it now unless we need to because something * under it was modified. */ - if (BP_PHYSICAL_BIRTH(bp) <= scn->scn_phys.scn_cur_max_txg) { - scan_funcs[scn->scn_phys.scn_func](dp, bp, zb); + if (BP_PHYSICAL_BIRTH(bp) > scn->scn_phys.scn_cur_max_txg) { + scn->scn_gt_max_this_txg++; + goto out; } + + scan_funcs[scn->scn_phys.scn_func](dp, bp, zb); +out: + kmem_free(bp_toread, sizeof (blkptr_t)); } static void @@ -942,26 +1813,33 @@ dmu_tx_t *tx) { zbookmark_phys_t zb; + scan_prefetch_ctx_t *spc; SET_BOOKMARK(&zb, ds ? 
ds->ds_object : DMU_META_OBJSET, ZB_ROOT_OBJECT, ZB_ROOT_LEVEL, ZB_ROOT_BLKID); - dsl_scan_visitbp(bp, &zb, NULL, - ds, scn, DMU_OST_NONE, tx); + if (ZB_IS_ZERO(&scn->scn_phys.scn_bookmark)) { + SET_BOOKMARK(&scn->scn_prefetch_bookmark, + zb.zb_objset, 0, 0, 0); + } else { + scn->scn_prefetch_bookmark = scn->scn_phys.scn_bookmark; + } + + scn->scn_objsets_visited_this_txg++; + + spc = scan_prefetch_ctx_create(scn, NULL, FTAG); + dsl_scan_prefetch(spc, bp, &zb); + scan_prefetch_ctx_rele(spc, FTAG); + + dsl_scan_visitbp(bp, &zb, NULL, ds, scn, DMU_OST_NONE, tx); + dprintf_ds(ds, "finished scan%s", ""); } -void -dsl_scan_ds_destroyed(dsl_dataset_t *ds, dmu_tx_t *tx) +static void +ds_destroyed_scn_phys(dsl_dataset_t *ds, dsl_scan_phys_t *scn_phys) { - dsl_pool_t *dp = ds->ds_dir->dd_pool; - dsl_scan_t *scn = dp->dp_scan; - uint64_t mintxg; - - if (scn->scn_phys.scn_state != DSS_SCANNING) - return; - - if (scn->scn_phys.scn_bookmark.zb_objset == ds->ds_object) { + if (scn_phys->scn_bookmark.zb_objset == ds->ds_object) { if (ds->ds_is_snapshot) { /* * Note: @@ -973,23 +1851,57 @@ * ignore it when we retraverse it in * dsl_scan_visitds(). */ - scn->scn_phys.scn_bookmark.zb_objset = + scn_phys->scn_bookmark.zb_objset = dsl_dataset_phys(ds)->ds_next_snap_obj; zfs_dbgmsg("destroying ds %llu; currently traversing; " "reset zb_objset to %llu", (u_longlong_t)ds->ds_object, (u_longlong_t)dsl_dataset_phys(ds)-> ds_next_snap_obj); - scn->scn_phys.scn_flags |= DSF_VISIT_DS_AGAIN; + scn_phys->scn_flags |= DSF_VISIT_DS_AGAIN; } else { - SET_BOOKMARK(&scn->scn_phys.scn_bookmark, + SET_BOOKMARK(&scn_phys->scn_bookmark, ZB_DESTROYED_OBJSET, 0, 0, 0); zfs_dbgmsg("destroying ds %llu; currently traversing; " "reset bookmark to -1,0,0,0", (u_longlong_t)ds->ds_object); } - } else if (zap_lookup_int_key(dp->dp_meta_objset, - scn->scn_phys.scn_queue_obj, ds->ds_object, &mintxg) == 0) { + } +} + +/* + * Invoked when a dataset is destroyed. 
We need to make sure that: + * + * 1) If it is the dataset that was currently being scanned, we write + * a new dsl_scan_phys_t and mark the objset reference in it + * as destroyed. + * 2) We remove it from the work queue, if it was present. + * + * If the dataset was actually a snapshot, rather than marking the dataset + * as destroyed, we substitute the next snapshot in line. + */ +void +dsl_scan_ds_destroyed(dsl_dataset_t *ds, dmu_tx_t *tx) +{ + dsl_pool_t *dp = ds->ds_dir->dd_pool; + dsl_scan_t *scn = dp->dp_scan; + uint64_t mintxg; + + if (!dsl_scan_is_running(scn)) + return; + + ds_destroyed_scn_phys(ds, &scn->scn_phys); + ds_destroyed_scn_phys(ds, &scn->scn_phys_cached); + + if (scan_ds_queue_contains(scn, ds->ds_object, &mintxg)) { + scan_ds_queue_remove(scn, ds->ds_object); + if (ds->ds_is_snapshot) + scan_ds_queue_insert(scn, + dsl_dataset_phys(ds)->ds_next_snap_obj, mintxg); + } + + if (zap_lookup_int_key(dp->dp_meta_objset, scn->scn_phys.scn_queue_obj, + ds->ds_object, &mintxg) == 0) { ASSERT3U(dsl_dataset_phys(ds)->ds_num_children, <=, 1); VERIFY3U(0, ==, zap_remove_int(dp->dp_meta_objset, scn->scn_phys.scn_queue_obj, ds->ds_object, tx)); @@ -1018,9 +1930,28 @@ * dsl_scan_sync() should be called after this, and should sync * out our changed state, but just to be safe, do it here. */ - dsl_scan_sync_state(scn, tx); + dsl_scan_sync_state(scn, tx, SYNC_CACHED); } +static void +ds_snapshotted_bookmark(dsl_dataset_t *ds, zbookmark_phys_t *scn_bookmark) +{ + if (scn_bookmark->zb_objset == ds->ds_object) { + scn_bookmark->zb_objset = + dsl_dataset_phys(ds)->ds_prev_snap_obj; + zfs_dbgmsg("snapshotting ds %llu; currently traversing; " + "reset zb_objset to %llu", + (u_longlong_t)ds->ds_object, + (u_longlong_t)dsl_dataset_phys(ds)->ds_prev_snap_obj); + } +} + +/* + * Called when a dataset is snapshotted. If we were currently traversing + * this snapshot, we reset our bookmark to point at the newly created + * snapshot. 
We also modify our work queue to remove the old snapshot and + * replace with the new one. + */ void dsl_scan_ds_snapshotted(dsl_dataset_t *ds, dmu_tx_t *tx) { @@ -1028,20 +1959,22 @@ dsl_scan_t *scn = dp->dp_scan; uint64_t mintxg; - if (scn->scn_phys.scn_state != DSS_SCANNING) + if (!dsl_scan_is_running(scn)) return; ASSERT(dsl_dataset_phys(ds)->ds_prev_snap_obj != 0); - if (scn->scn_phys.scn_bookmark.zb_objset == ds->ds_object) { - scn->scn_phys.scn_bookmark.zb_objset = - dsl_dataset_phys(ds)->ds_prev_snap_obj; - zfs_dbgmsg("snapshotting ds %llu; currently traversing; " - "reset zb_objset to %llu", - (u_longlong_t)ds->ds_object, - (u_longlong_t)dsl_dataset_phys(ds)->ds_prev_snap_obj); - } else if (zap_lookup_int_key(dp->dp_meta_objset, - scn->scn_phys.scn_queue_obj, ds->ds_object, &mintxg) == 0) { + ds_snapshotted_bookmark(ds, &scn->scn_phys.scn_bookmark); + ds_snapshotted_bookmark(ds, &scn->scn_phys_cached.scn_bookmark); + + if (scan_ds_queue_contains(scn, ds->ds_object, &mintxg)) { + scan_ds_queue_remove(scn, ds->ds_object); + scan_ds_queue_insert(scn, + dsl_dataset_phys(ds)->ds_prev_snap_obj, mintxg); + } + + if (zap_lookup_int_key(dp->dp_meta_objset, scn->scn_phys.scn_queue_obj, + ds->ds_object, &mintxg) == 0) { VERIFY3U(0, ==, zap_remove_int(dp->dp_meta_objset, scn->scn_phys.scn_queue_obj, ds->ds_object, tx)); VERIFY(zap_add_int_key(dp->dp_meta_objset, @@ -1052,37 +1985,59 @@ (u_longlong_t)ds->ds_object, (u_longlong_t)dsl_dataset_phys(ds)->ds_prev_snap_obj); } - dsl_scan_sync_state(scn, tx); + + dsl_scan_sync_state(scn, tx, SYNC_CACHED); } -void -dsl_scan_ds_clone_swapped(dsl_dataset_t *ds1, dsl_dataset_t *ds2, dmu_tx_t *tx) +static void +ds_clone_swapped_bookmark(dsl_dataset_t *ds1, dsl_dataset_t *ds2, + zbookmark_phys_t *scn_bookmark) { - dsl_pool_t *dp = ds1->ds_dir->dd_pool; - dsl_scan_t *scn = dp->dp_scan; - uint64_t mintxg; - - if (scn->scn_phys.scn_state != DSS_SCANNING) - return; - - if (scn->scn_phys.scn_bookmark.zb_objset == ds1->ds_object) { - 
scn->scn_phys.scn_bookmark.zb_objset = ds2->ds_object; + if (scn_bookmark->zb_objset == ds1->ds_object) { + scn_bookmark->zb_objset = ds2->ds_object; zfs_dbgmsg("clone_swap ds %llu; currently traversing; " "reset zb_objset to %llu", (u_longlong_t)ds1->ds_object, (u_longlong_t)ds2->ds_object); - } else if (scn->scn_phys.scn_bookmark.zb_objset == ds2->ds_object) { - scn->scn_phys.scn_bookmark.zb_objset = ds1->ds_object; + } else if (scn_bookmark->zb_objset == ds2->ds_object) { + scn_bookmark->zb_objset = ds1->ds_object; zfs_dbgmsg("clone_swap ds %llu; currently traversing; " "reset zb_objset to %llu", (u_longlong_t)ds2->ds_object, (u_longlong_t)ds1->ds_object); } +} +/* + * Called when a parent dataset and its clone are swapped. If we were + * currently traversing the dataset, we need to switch to traversing the + * newly promoted parent. + */ +void +dsl_scan_ds_clone_swapped(dsl_dataset_t *ds1, dsl_dataset_t *ds2, dmu_tx_t *tx) +{ + dsl_pool_t *dp = ds1->ds_dir->dd_pool; + dsl_scan_t *scn = dp->dp_scan; + uint64_t mintxg; + + if (!dsl_scan_is_running(scn)) + return; + + ds_clone_swapped_bookmark(ds1, ds2, &scn->scn_phys.scn_bookmark); + ds_clone_swapped_bookmark(ds1, ds2, &scn->scn_phys_cached.scn_bookmark); + + if (scan_ds_queue_contains(scn, ds1->ds_object, &mintxg)) { + scan_ds_queue_remove(scn, ds1->ds_object); + scan_ds_queue_insert(scn, ds2->ds_object, mintxg); + } + if (scan_ds_queue_contains(scn, ds2->ds_object, &mintxg)) { + scan_ds_queue_remove(scn, ds2->ds_object); + scan_ds_queue_insert(scn, ds1->ds_object, mintxg); + } + if (zap_lookup_int_key(dp->dp_meta_objset, scn->scn_phys.scn_queue_obj, ds1->ds_object, &mintxg) == 0) { int err; - ASSERT3U(mintxg, ==, dsl_dataset_phys(ds1)->ds_prev_snap_txg); ASSERT3U(mintxg, ==, dsl_dataset_phys(ds2)->ds_prev_snap_txg); VERIFY3U(0, ==, zap_remove_int(dp->dp_meta_objset, @@ -1100,8 +2055,9 @@ "replacing with %llu", (u_longlong_t)ds1->ds_object, (u_longlong_t)ds2->ds_object); - } else if 
(zap_lookup_int_key(dp->dp_meta_objset, - scn->scn_phys.scn_queue_obj, ds2->ds_object, &mintxg) == 0) { + } + if (zap_lookup_int_key(dp->dp_meta_objset, scn->scn_phys.scn_queue_obj, + ds2->ds_object, &mintxg) == 0) { ASSERT3U(mintxg, ==, dsl_dataset_phys(ds1)->ds_prev_snap_txg); ASSERT3U(mintxg, ==, dsl_dataset_phys(ds2)->ds_prev_snap_txg); VERIFY3U(0, ==, zap_remove_int(dp->dp_meta_objset, @@ -1114,31 +2070,26 @@ (u_longlong_t)ds1->ds_object); } - dsl_scan_sync_state(scn, tx); + dsl_scan_sync_state(scn, tx, SYNC_CACHED); } -struct enqueue_clones_arg { - dmu_tx_t *tx; - uint64_t originobj; -}; - /* ARGSUSED */ static int enqueue_clones_cb(dsl_pool_t *dp, dsl_dataset_t *hds, void *arg) { - struct enqueue_clones_arg *eca = arg; + uint64_t originobj = *(uint64_t *)arg; dsl_dataset_t *ds; int err; dsl_scan_t *scn = dp->dp_scan; - if (dsl_dir_phys(hds->ds_dir)->dd_origin_obj != eca->originobj) + if (dsl_dir_phys(hds->ds_dir)->dd_origin_obj != originobj) return (0); err = dsl_dataset_hold_obj(dp, hds->ds_object, FTAG, &ds); if (err) return (err); - while (dsl_dataset_phys(ds)->ds_prev_snap_obj != eca->originobj) { + while (dsl_dataset_phys(ds)->ds_prev_snap_obj != originobj) { dsl_dataset_t *prev; err = dsl_dataset_hold_obj(dp, dsl_dataset_phys(ds)->ds_prev_snap_obj, FTAG, &prev); @@ -1148,9 +2099,8 @@ return (err); ds = prev; } - VERIFY(zap_add_int_key(dp->dp_meta_objset, - scn->scn_phys.scn_queue_obj, ds->ds_object, - dsl_dataset_phys(ds)->ds_prev_snap_txg, eca->tx) == 0); + scan_ds_queue_insert(scn, ds->ds_object, + dsl_dataset_phys(ds)->ds_prev_snap_txg); dsl_dataset_rele(ds, FTAG); return (0); } @@ -1160,6 +2110,7 @@ { dsl_pool_t *dp = scn->scn_dp; dsl_dataset_t *ds; + objset_t *os; VERIFY3U(0, ==, dsl_dataset_hold_obj(dp, dsobj, FTAG, &ds)); @@ -1195,14 +2146,17 @@ dsl_dataset_name(ds, dsname); zfs_dbgmsg("scanning dataset %llu (%s) is unnecessary because " "cur_min_txg (%llu) >= max_txg (%llu)", - dsobj, dsname, - scn->scn_phys.scn_cur_min_txg, - 
scn->scn_phys.scn_max_txg); + (longlong_t)dsobj, dsname, + (longlong_t)scn->scn_phys.scn_cur_min_txg, + (longlong_t)scn->scn_phys.scn_max_txg); kmem_free(dsname, MAXNAMELEN); goto out; } + if (dmu_objset_from_ds(ds, &os)) + goto out; + /* * Only the ZIL in the head (non-snapshot) is valid. Even though * snapshots can have ZIL block pointers (which may be the same @@ -1212,14 +2166,8 @@ * rather than in scan_recurse(), because the regular snapshot * block-sharing rules don't apply to it. */ - if (DSL_SCAN_IS_SCRUB_RESILVER(scn) && !dsl_dataset_is_snapshot(ds) && - ds->ds_dir != dp->dp_origin_snap->ds_dir) { - objset_t *os; - if (dmu_objset_from_ds(ds, &os) != 0) { - goto out; - } + if (!ds->ds_is_snapshot) dsl_scan_zil(dp, &os->os_zil_header); - } /* * Iterate over the bps in this ds. @@ -1252,9 +2200,8 @@ if (scn->scn_phys.scn_flags & DSF_VISIT_DS_AGAIN) { zfs_dbgmsg("incomplete pass; visiting again"); scn->scn_phys.scn_flags &= ~DSF_VISIT_DS_AGAIN; - VERIFY(zap_add_int_key(dp->dp_meta_objset, - scn->scn_phys.scn_queue_obj, ds->ds_object, - scn->scn_phys.scn_cur_max_txg, tx) == 0); + scan_ds_queue_insert(scn, ds->ds_object, + scn->scn_phys.scn_cur_max_txg); goto out; } @@ -1262,10 +2209,9 @@ * Add descendent datasets to work queue. 
*/ if (dsl_dataset_phys(ds)->ds_next_snap_obj != 0) { - VERIFY(zap_add_int_key(dp->dp_meta_objset, - scn->scn_phys.scn_queue_obj, + scan_ds_queue_insert(scn, dsl_dataset_phys(ds)->ds_next_snap_obj, - dsl_dataset_phys(ds)->ds_creation_txg, tx) == 0); + dsl_dataset_phys(ds)->ds_creation_txg); } if (dsl_dataset_phys(ds)->ds_num_children > 1) { boolean_t usenext = B_FALSE; @@ -1286,17 +2232,21 @@ } if (usenext) { - VERIFY0(zap_join_key(dp->dp_meta_objset, - dsl_dataset_phys(ds)->ds_next_clones_obj, - scn->scn_phys.scn_queue_obj, - dsl_dataset_phys(ds)->ds_creation_txg, tx)); + zap_cursor_t zc; + zap_attribute_t za; + for (zap_cursor_init(&zc, dp->dp_meta_objset, + dsl_dataset_phys(ds)->ds_next_clones_obj); + zap_cursor_retrieve(&zc, &za) == 0; + (void) zap_cursor_advance(&zc)) { + scan_ds_queue_insert(scn, + zfs_strtonum(za.za_name, NULL), + dsl_dataset_phys(ds)->ds_creation_txg); + } + zap_cursor_fini(&zc); } else { - struct enqueue_clones_arg eca; - eca.tx = tx; - eca.originobj = ds->ds_object; - VERIFY0(dmu_objset_find_dp(dp, dp->dp_root_dir_obj, - enqueue_clones_cb, &eca, DS_FIND_CHILDREN)); + enqueue_clones_cb, &ds->ds_object, + DS_FIND_CHILDREN)); } } @@ -1308,7 +2258,6 @@ static int enqueue_cb(dsl_pool_t *dp, dsl_dataset_t *hds, void *arg) { - dmu_tx_t *tx = arg; dsl_dataset_t *ds; int err; dsl_scan_t *scn = dp->dp_scan; @@ -1338,12 +2287,37 @@ ds = prev; } - VERIFY(zap_add_int_key(dp->dp_meta_objset, scn->scn_phys.scn_queue_obj, - ds->ds_object, dsl_dataset_phys(ds)->ds_prev_snap_txg, tx) == 0); + scan_ds_queue_insert(scn, ds->ds_object, + dsl_dataset_phys(ds)->ds_prev_snap_txg); dsl_dataset_rele(ds, FTAG); return (0); } +/* ARGSUSED */ +void +dsl_scan_ddt_entry(dsl_scan_t *scn, enum zio_checksum checksum, + ddt_entry_t *dde, dmu_tx_t *tx) +{ + const ddt_key_t *ddk = &dde->dde_key; + ddt_phys_t *ddp = dde->dde_phys; + blkptr_t bp; + zbookmark_phys_t zb = { 0 }; + int p; + + if (scn->scn_phys.scn_state != DSS_SCANNING) + return; + + for (p = 0; p < 
DDT_PHYS_TYPES; p++, ddp++) { + if (ddp->ddp_phys_birth == 0 || + ddp->ddp_phys_birth > scn->scn_phys.scn_max_txg) + continue; + ddt_bp_create(checksum, ddk, ddp, &bp); + + scn->scn_visited_this_txg++; + scan_funcs[scn->scn_phys.scn_func](scn->scn_dp, &bp, &zb); + } +} + /* * Scrub/dedup interaction. * @@ -1416,36 +2390,20 @@ ddb->ddb_class > scn->scn_phys.scn_ddt_class_max); } -/* ARGSUSED */ -void -dsl_scan_ddt_entry(dsl_scan_t *scn, enum zio_checksum checksum, - ddt_entry_t *dde, dmu_tx_t *tx) +static uint64_t +dsl_scan_ds_maxtxg(dsl_dataset_t *ds) { - const ddt_key_t *ddk = &dde->dde_key; - ddt_phys_t *ddp = dde->dde_phys; - blkptr_t bp; - zbookmark_phys_t zb = { 0 }; - - if (scn->scn_phys.scn_state != DSS_SCANNING) - return; - - for (int p = 0; p < DDT_PHYS_TYPES; p++, ddp++) { - if (ddp->ddp_phys_birth == 0 || - ddp->ddp_phys_birth > scn->scn_phys.scn_max_txg) - continue; - ddt_bp_create(checksum, ddk, ddp, &bp); - - scn->scn_visited_this_txg++; - scan_funcs[scn->scn_phys.scn_func](scn->scn_dp, &bp, &zb); - } + uint64_t smt = ds->ds_dir->dd_pool->dp_scan->scn_phys.scn_max_txg; + if (ds->ds_is_snapshot) + return (MIN(smt, dsl_dataset_phys(ds)->ds_creation_txg)); + return (smt); } static void dsl_scan_visit(dsl_scan_t *scn, dmu_tx_t *tx) { + scan_ds_t *sds; dsl_pool_t *dp = scn->scn_dp; - zap_cursor_t zc; - zap_attribute_t za; if (scn->scn_phys.scn_ddt_bookmark.ddb_class <= scn->scn_phys.scn_ddt_class_max) { @@ -1469,7 +2427,7 @@ if (spa_version(dp->dp_spa) < SPA_VERSION_DSL_SCRUB) { VERIFY0(dmu_objset_find_dp(dp, dp->dp_root_dir_obj, - enqueue_cb, tx, DS_FIND_CHILDREN)); + enqueue_cb, NULL, DS_FIND_CHILDREN)); } else { dsl_scan_visitds(scn, dp->dp_origin_snap->ds_object, tx); @@ -1477,40 +2435,42 @@ ASSERT(!scn->scn_suspending); } else if (scn->scn_phys.scn_bookmark.zb_objset != ZB_DESTROYED_OBJSET) { + uint64_t dsobj = scn->scn_phys.scn_bookmark.zb_objset; /* - * If we were suspended, continue from here. 
Note if the + * If we were suspended, continue from here. Note if the * ds we were suspended on was deleted, the zb_objset may * be -1, so we will skip this and find a new objset * below. */ - dsl_scan_visitds(scn, scn->scn_phys.scn_bookmark.zb_objset, tx); + dsl_scan_visitds(scn, dsobj, tx); if (scn->scn_suspending) return; } /* - * In case we were suspended right at the end of the ds, zero the + * In case we suspended right at the end of the ds, zero the * bookmark so we don't think that we're still trying to resume. */ bzero(&scn->scn_phys.scn_bookmark, sizeof (zbookmark_phys_t)); - /* keep pulling things out of the zap-object-as-queue */ - while (zap_cursor_init(&zc, dp->dp_meta_objset, - scn->scn_phys.scn_queue_obj), - zap_cursor_retrieve(&zc, &za) == 0) { + /* + * Keep pulling things out of the dataset avl queue. Updates to the + * persistent zap-object-as-queue happen only at checkpoints. + */ + while ((sds = avl_first(&scn->scn_queue)) != NULL) { dsl_dataset_t *ds; - uint64_t dsobj; + uint64_t dsobj = sds->sds_dsobj; + uint64_t txg = sds->sds_txg; - dsobj = zfs_strtonum(za.za_name, NULL); - VERIFY3U(0, ==, zap_remove_int(dp->dp_meta_objset, - scn->scn_phys.scn_queue_obj, dsobj, tx)); + /* dequeue and free the ds from the queue */ + scan_ds_queue_remove(scn, dsobj); + sds = NULL; /* must not be touched after removal */ - /* Set up min/max txg */ + /* Set up min / max txg */ VERIFY3U(0, ==, dsl_dataset_hold_obj(dp, dsobj, FTAG, &ds)); - if (za.za_first_integer != 0) { + if (txg != 0) { scn->scn_phys.scn_cur_min_txg = - MAX(scn->scn_phys.scn_min_txg, - za.za_first_integer); + MAX(scn->scn_phys.scn_min_txg, txg); } else { scn->scn_phys.scn_cur_min_txg = MAX(scn->scn_phys.scn_min_txg, @@ -1520,14 +2480,367 @@ dsl_dataset_rele(ds, FTAG); dsl_scan_visitds(scn, dsobj, tx); - zap_cursor_fini(&zc); if (scn->scn_suspending) return; } - zap_cursor_fini(&zc); + /* No more objsets to fetch, we're done */ + scn->scn_phys.scn_bookmark.zb_objset = ZB_DESTROYED_OBJSET; + 
ASSERT0(scn->scn_suspending); } +static uint64_t +dsl_scan_count_leaves(vdev_t *vd) +{ + uint64_t i, leaves = 0; + + /* we only count leaves that belong to the main pool and are readable */ + if (vd->vdev_islog || vd->vdev_isspare || + vd->vdev_isl2cache || !vdev_readable(vd)) + return (0); + + if (vd->vdev_ops->vdev_op_leaf) + return (1); + + for (i = 0; i < vd->vdev_children; i++) { + leaves += dsl_scan_count_leaves(vd->vdev_child[i]); + } + + return (leaves); +} + + +static void +scan_io_queues_update_zio_stats(dsl_scan_io_queue_t *q, const blkptr_t *bp) +{ + int i; + uint64_t cur_size = 0; + + for (i = 0; i < BP_GET_NDVAS(bp); i++) { + cur_size += DVA_GET_ASIZE(&bp->blk_dva[i]); + } + + q->q_total_zio_size_this_txg += cur_size; + q->q_zios_this_txg++; +} + +static void +scan_io_queues_update_seg_stats(dsl_scan_io_queue_t *q, uint64_t start, + uint64_t end) +{ + q->q_total_seg_size_this_txg += end - start; + q->q_segs_this_txg++; +} + static boolean_t +scan_io_queue_check_suspend(dsl_scan_t *scn) +{ + /* See comment in dsl_scan_check_suspend() */ + uint64_t curr_time_ns = gethrtime(); + uint64_t scan_time_ns = curr_time_ns - scn->scn_sync_start_time; + uint64_t sync_time_ns = curr_time_ns - + scn->scn_dp->dp_spa->spa_sync_starttime; + int dirty_pct = scn->scn_dp->dp_dirty_total * 100 / zfs_dirty_data_max; + int mintime = (scn->scn_phys.scn_func == POOL_SCAN_RESILVER) ? + zfs_resilver_min_time_ms : zfs_scrub_min_time_ms; + + return ((NSEC2MSEC(scan_time_ns) > mintime && + (dirty_pct >= zfs_vdev_async_write_active_min_dirty_percent || + txg_sync_waiting(scn->scn_dp) || + NSEC2SEC(sync_time_ns) >= zfs_txg_timeout)) || + spa_shutting_down(scn->scn_dp->dp_spa)); +} + +/* + * Given a list of scan_io_t's in io_list, this issues the io's out to + * disk. This consumes the io_list and frees the scan_io_t's. This is + * called when emptying queues, either when we're up against the memory + * limit or when we have finished scanning. 
Returns B_TRUE if we stopped + * processing the list before we finished. Any zios that were not issued + * will remain in the io_list. + */ +static boolean_t +scan_io_queue_issue(dsl_scan_io_queue_t *queue, list_t *io_list) +{ + dsl_scan_t *scn = queue->q_scn; + scan_io_t *sio; + int64_t bytes_issued = 0; + boolean_t suspended = B_FALSE; + + while ((sio = list_head(io_list)) != NULL) { + blkptr_t bp; + + if (scan_io_queue_check_suspend(scn)) { + suspended = B_TRUE; + break; + } + + sio2bp(sio, &bp, queue->q_vd->vdev_id); + bytes_issued += sio->sio_asize; + scan_exec_io(scn->scn_dp, &bp, sio->sio_flags, + &sio->sio_zb, queue); + (void) list_remove_head(io_list); + scan_io_queues_update_zio_stats(queue, &bp); + kmem_free(sio, sizeof (*sio)); + } + + atomic_add_64(&scn->scn_bytes_pending, -bytes_issued); + + return (suspended); +} + +/* + * Given a range_seg_t (extent) and a list, this function passes over a + * scan queue and gathers up the appropriate ios which fit into that + * scan seg (starting from lowest LBA). At the end, we remove the segment + * from the q_exts_by_addr range tree. + */ +static boolean_t +scan_io_queue_gather(dsl_scan_io_queue_t *queue, range_seg_t *rs, list_t *list) +{ + scan_io_t srch_sio, *sio, *next_sio; + avl_index_t idx; + uint_t num_sios = 0; + int64_t bytes_issued = 0; + + ASSERT(rs != NULL); + ASSERT(MUTEX_HELD(&queue->q_vd->vdev_scan_io_queue_lock)); + + srch_sio.sio_offset = rs->rs_start; + + /* + * The exact start of the extent might not contain any matching zios, + * so if that's the case, examine the next one in the tree. 
+ */ + sio = avl_find(&queue->q_sios_by_addr, &srch_sio, &idx); + if (sio == NULL) + sio = avl_nearest(&queue->q_sios_by_addr, idx, AVL_AFTER); + + while (sio != NULL && sio->sio_offset < rs->rs_end && num_sios <= 32) { + ASSERT3U(sio->sio_offset, >=, rs->rs_start); + ASSERT3U(sio->sio_offset + sio->sio_asize, <=, rs->rs_end); + + next_sio = AVL_NEXT(&queue->q_sios_by_addr, sio); + avl_remove(&queue->q_sios_by_addr, sio); + + bytes_issued += sio->sio_asize; + num_sios++; + list_insert_tail(list, sio); + sio = next_sio; + } + + /* + * We limit the number of sios we process at once to 32 to avoid + * biting off more than we can chew. If we didn't take everything + * in the segment we update it to reflect the work we were able to + * complete. Otherwise, we remove it from the range tree entirely. + */ + if (sio != NULL && sio->sio_offset < rs->rs_end) { + range_tree_adjust_fill(queue->q_exts_by_addr, rs, + -bytes_issued); + range_tree_resize_segment(queue->q_exts_by_addr, rs, + sio->sio_offset, rs->rs_end - sio->sio_offset); + + return (B_TRUE); + } else { + range_tree_remove(queue->q_exts_by_addr, rs->rs_start, + rs->rs_end - rs->rs_start); + return (B_FALSE); + } +} + + +/* + * This is called from the queue emptying thread and selects the next + * extent from which we are to issue io's. The behavior of this function + * depends on the state of the scan, the current memory consumption and + * whether or not we are performing a scan shutdown. + * 1) We select extents in an elevator algorithm (LBA-order) if the scan + * needs to perform a checkpoint + * 2) We select the largest available extent if we are up against the + * memory limit. + * 3) Otherwise we don't select any extents. 
+ */ +static const range_seg_t * +scan_io_queue_fetch_ext(dsl_scan_io_queue_t *queue) +{ + dsl_scan_t *scn = queue->q_scn; + + ASSERT(MUTEX_HELD(&queue->q_vd->vdev_scan_io_queue_lock)); + ASSERT(scn->scn_is_sorted); + + /* handle tunable overrides */ + if (scn->scn_checkpointing || scn->scn_clearing) { + if (zfs_scan_issue_strategy == 1) { + return (range_tree_first(queue->q_exts_by_addr)); + } else if (zfs_scan_issue_strategy == 2) { + return (avl_first(&queue->q_exts_by_size)); + } + } + + /* + * During normal clearing, we want to issue our largest segments + * first, keeping IO as sequential as possible, and leaving the + * smaller extents for later with the hope that they might eventually + * grow to larger sequential segments. However, when the scan is + * checkpointing, no new extents will be added to the sorting queue, + * so the way we are sorted now is as good as it will ever get. + * In this case, we instead switch to issuing extents in LBA order. + */ + if (scn->scn_checkpointing) { + return (range_tree_first(queue->q_exts_by_addr)); + } else if (scn->scn_clearing) { + return (avl_first(&queue->q_exts_by_size)); + } else { + return (NULL); + } +} + +static void +scan_io_queues_run_one(void *arg) +{ + dsl_scan_io_queue_t *queue = arg; + kmutex_t *q_lock = &queue->q_vd->vdev_scan_io_queue_lock; + boolean_t suspended = B_FALSE; + range_seg_t *rs = NULL; + scan_io_t *sio = NULL; + list_t sio_list; + uint64_t bytes_per_leaf = zfs_scan_vdev_limit; + uint64_t nr_leaves = dsl_scan_count_leaves(queue->q_vd); + + ASSERT(queue->q_scn->scn_is_sorted); + + list_create(&sio_list, sizeof (scan_io_t), + offsetof(scan_io_t, sio_nodes.sio_list_node)); + mutex_enter(q_lock); + + /* calculate maximum in-flight bytes for this txg (min 1MB) */ + queue->q_maxinflight_bytes = + MAX(nr_leaves * bytes_per_leaf, 1ULL << 20); + + /* reset per-queue scan statistics for this txg */ + queue->q_total_seg_size_this_txg = 0; + queue->q_segs_this_txg = 0; + 
queue->q_total_zio_size_this_txg = 0; + queue->q_zios_this_txg = 0; + + /* loop until we have run out of time or sios */ + while ((rs = (range_seg_t*)scan_io_queue_fetch_ext(queue)) != NULL) { + uint64_t seg_start = 0, seg_end = 0; + boolean_t more_left = B_TRUE; + + ASSERT(list_is_empty(&sio_list)); + + /* loop while we still have sios left to process in this rs */ + while (more_left) { + scan_io_t *first_sio, *last_sio; + + /* + * We have selected which extent needs to be + * processed next. Gather up the corresponding sios. + */ + more_left = scan_io_queue_gather(queue, rs, &sio_list); + ASSERT(!list_is_empty(&sio_list)); + first_sio = list_head(&sio_list); + last_sio = list_tail(&sio_list); + + seg_end = last_sio->sio_offset + last_sio->sio_asize; + if (seg_start == 0) + seg_start = first_sio->sio_offset; + + /* + * Issuing sios can take a long time so drop the + * queue lock. The sio queue won't be updated by + * other threads since we're in syncing context so + * we can be sure that our trees will remain exactly + * as we left them. + */ + mutex_exit(q_lock); + suspended = scan_io_queue_issue(queue, &sio_list); + mutex_enter(q_lock); + + if (suspended) + break; + } + /* update statistics for debugging purposes */ + scan_io_queues_update_seg_stats(queue, seg_start, seg_end); + + if (suspended) + break; + } + + + /* If we were suspended in the middle of processing, + * requeue any unfinished sios and exit. + */ + while ((sio = list_head(&sio_list)) != NULL) { + list_remove(&sio_list, sio); + scan_io_queue_insert_impl(queue, sio); + } + + mutex_exit(q_lock); + list_destroy(&sio_list); +} + +/* + * Performs an emptying run on all scan queues in the pool. This just + * punches out one thread per top-level vdev, each of which processes + * only that vdev's scan queue. We can parallelize the I/O here because + * we know that each queue's io's only affect its own top-level vdev. 
+ * + * This function waits for the queue runs to complete, and must be + * called from dsl_scan_sync (or in general, syncing context). + */ +static void +scan_io_queues_run(dsl_scan_t *scn) +{ + spa_t *spa = scn->scn_dp->dp_spa; + + ASSERT(scn->scn_is_sorted); + ASSERT(spa_config_held(spa, SCL_CONFIG, RW_READER)); + + if (scn->scn_bytes_pending == 0) + return; + + if (scn->scn_taskq == NULL) { + char *tq_name = kmem_zalloc(ZFS_MAX_DATASET_NAME_LEN + 16, + KM_SLEEP); + int nthreads = spa->spa_root_vdev->vdev_children; + + /* + * We need to make this taskq *always* execute as many + * threads in parallel as we have top-level vdevs and no + * less, otherwise strange serialization of the calls to + * scan_io_queues_run_one can occur during spa_sync runs + * and that significantly impacts performance. + */ + (void) snprintf(tq_name, ZFS_MAX_DATASET_NAME_LEN + 16, + "dsl_scan_tq_%s", spa->spa_name); + scn->scn_taskq = taskq_create(tq_name, nthreads, minclsyspri, + nthreads, nthreads, TASKQ_PREPOPULATE); + kmem_free(tq_name, ZFS_MAX_DATASET_NAME_LEN + 16); + } + + for (uint64_t i = 0; i < spa->spa_root_vdev->vdev_children; i++) { + vdev_t *vd = spa->spa_root_vdev->vdev_child[i]; + + mutex_enter(&vd->vdev_scan_io_queue_lock); + if (vd->vdev_scan_io_queue != NULL) { + VERIFY(taskq_dispatch(scn->scn_taskq, + scan_io_queues_run_one, vd->vdev_scan_io_queue, + TQ_SLEEP) != TASKQID_INVALID); + } + mutex_exit(&vd->vdev_scan_io_queue_lock); + } + + /* + * Wait for the queues to finish issuing their IOs for this run + * before we return. There may still be IOs in flight at this + * point. 
+ */ + taskq_wait(scn->scn_taskq); +} + +static boolean_t dsl_scan_async_block_should_pause(dsl_scan_t *scn) { uint64_t elapsed_nanosecs; @@ -1581,6 +2894,41 @@ return (0); } +static void +dsl_scan_update_stats(dsl_scan_t *scn) +{ + spa_t *spa = scn->scn_dp->dp_spa; + uint64_t i; + uint64_t seg_size_total = 0, zio_size_total = 0; + uint64_t seg_count_total = 0, zio_count_total = 0; + + for (i = 0; i < spa->spa_root_vdev->vdev_children; i++) { + vdev_t *vd = spa->spa_root_vdev->vdev_child[i]; + dsl_scan_io_queue_t *queue = vd->vdev_scan_io_queue; + + if (queue == NULL) + continue; + + seg_size_total += queue->q_total_seg_size_this_txg; + zio_size_total += queue->q_total_zio_size_this_txg; + seg_count_total += queue->q_segs_this_txg; + zio_count_total += queue->q_zios_this_txg; + } + + if (seg_count_total == 0 || zio_count_total == 0) { + scn->scn_avg_seg_size_this_txg = 0; + scn->scn_avg_zio_size_this_txg = 0; + scn->scn_segs_this_txg = 0; + scn->scn_zios_this_txg = 0; + return; + } + + scn->scn_avg_seg_size_this_txg = seg_size_total / seg_count_total; + scn->scn_avg_zio_size_this_txg = zio_size_total / zio_count_total; + scn->scn_segs_this_txg = seg_count_total; + scn->scn_zios_this_txg = zio_count_total; +} + boolean_t dsl_scan_active(dsl_scan_t *scn) { @@ -1591,8 +2939,7 @@ return (B_FALSE); if (spa_shutting_down(spa)) return (B_FALSE); - if ((scn->scn_phys.scn_state == DSS_SCANNING && - !dsl_scan_is_paused_scrub(scn)) || + if ((dsl_scan_is_running(scn) && !dsl_scan_is_paused_scrub(scn)) || (scn->scn_async_destroying && !scn->scn_async_stalled)) return (B_TRUE); @@ -1614,14 +2961,15 @@ return (0); if (zfs_free_bpobj_enabled && - spa_version(dp->dp_spa) >= SPA_VERSION_DEADLISTS) { + spa_version(spa) >= SPA_VERSION_DEADLISTS) { scn->scn_is_bptree = B_FALSE; scn->scn_async_block_min_time_ms = zfs_free_min_time_ms; - scn->scn_zio_root = zio_root(dp->dp_spa, NULL, + scn->scn_zio_root = zio_root(spa, NULL, NULL, ZIO_FLAG_MUSTSUCCEED); err = 
bpobj_iterate(&dp->dp_free_bpobj, dsl_scan_free_block_cb, scn, tx); - VERIFY3U(0, ==, zio_wait(scn->scn_zio_root)); + VERIFY0(zio_wait(scn->scn_zio_root)); + scn->scn_zio_root = NULL; if (err != 0 && err != ERESTART) zfs_panic_recover("error %u from bpobj_iterate()", err); @@ -1630,11 +2978,12 @@ if (err == 0 && spa_feature_is_active(spa, SPA_FEATURE_ASYNC_DESTROY)) { ASSERT(scn->scn_async_destroying); scn->scn_is_bptree = B_TRUE; - scn->scn_zio_root = zio_root(dp->dp_spa, NULL, + scn->scn_zio_root = zio_root(spa, NULL, NULL, ZIO_FLAG_MUSTSUCCEED); err = bptree_iterate(dp->dp_meta_objset, dp->dp_bptree_obj, B_TRUE, dsl_scan_free_block_cb, scn, tx); VERIFY0(zio_wait(scn->scn_zio_root)); + scn->scn_zio_root = NULL; if (err == EIO || err == ECKSUM) { err = 0; @@ -1743,12 +3092,60 @@ return (0); } +static boolean_t +dsl_scan_need_resilver(spa_t *spa, const dva_t *dva, size_t psize, + uint64_t phys_birth) +{ + vdev_t *vd; + + if (DVA_GET_GANG(dva)) { + /* + * Gang members may be spread across multiple + * vdevs, so the best estimate we have is the + * scrub range, which has already been checked. + * XXX -- it would be better to change our + * allocation policy to ensure that all + * gang members reside on the same vdev. + */ + return (B_TRUE); + } + + vd = vdev_lookup_top(spa, DVA_GET_VDEV(dva)); + + /* + * Check if the txg falls within the range which must be + * resilvered. DVAs outside this range can always be skipped. + */ + if (!vdev_dtl_contains(vd, DTL_PARTIAL, phys_birth, 1)) + return (B_FALSE); + + /* + * Check if the top-level vdev must resilver this offset. + * When the offset does not intersect with a dirty leaf DTL + * then it may be possible to skip the resilver IO. The psize + * is provided instead of asize to simplify the check for RAIDZ. + */ + if (!vdev_dtl_need_resilver(vd, DVA_GET_OFFSET(dva), psize)) + return (B_FALSE); + + return (B_TRUE); +} + +/* + * This is the primary entry point for scans that is called from syncing + * context. 
Scans must happen entirely during syncing context so that we + * can guarantee that blocks we are currently scanning will not change out + * from under us. While a scan is active, this function controls how quickly + * transaction groups proceed, instead of the normal handling provided by + * txg_sync_thread(). + */ void dsl_scan_sync(dsl_pool_t *dp, dmu_tx_t *tx) { + int err = 0; dsl_scan_t *scn = dp->dp_scan; spa_t *spa = dp->dp_spa; - int err = 0; + state_sync_type_t sync_type = SYNC_OPTIONAL; /* * Check for scn_restart_txg before checking spa_load_state, so @@ -1761,7 +3158,7 @@ if (vdev_resilver_needed(spa->spa_root_vdev, NULL, NULL)) func = POOL_SCAN_RESILVER; zfs_dbgmsg("restarting scan func=%u txg=%llu", - func, tx->tx_txg); + func, (longlong_t)tx->tx_txg); dsl_scan_setup_sync(&func, tx); } @@ -1785,7 +3182,17 @@ if (!scn->scn_async_stalled && !dsl_scan_active(scn)) return; + /* reset scan statistics */ scn->scn_visited_this_txg = 0; + scn->scn_holes_this_txg = 0; + scn->scn_lt_min_this_txg = 0; + scn->scn_gt_max_this_txg = 0; + scn->scn_ddt_contained_this_txg = 0; + scn->scn_objsets_visited_this_txg = 0; + scn->scn_avg_seg_size_this_txg = 0; + scn->scn_segs_this_txg = 0; + scn->scn_avg_zio_size_this_txg = 0; + scn->scn_zios_this_txg = 0; scn->scn_suspending = B_FALSE; scn->scn_sync_start_time = gethrtime(); spa->spa_scrub_active = B_TRUE; @@ -1801,110 +3208,189 @@ if (err != 0) return; - if (scn->scn_phys.scn_state != DSS_SCANNING) + if (!dsl_scan_is_running(scn) || dsl_scan_is_paused_scrub(scn)) return; - if (scn->scn_done_txg == tx->tx_txg) { - ASSERT(!scn->scn_suspending); - /* finished with scan. */ - zfs_dbgmsg("txg %llu scan complete", tx->tx_txg); - dsl_scan_done(scn, B_TRUE, tx); - ASSERT3U(spa->spa_scrub_inflight, ==, 0); - dsl_scan_sync_state(scn, tx); + /* + * Wait a few txgs after importing to begin scanning so that + * we can get the pool imported quickly. 
+ */ + if (spa->spa_syncing_txg < spa->spa_first_txg + SCAN_IMPORT_WAIT_TXGS) return; + + /* + * It is possible to switch from unsorted to sorted at any time, + * but afterwards the scan will remain sorted unless reloaded from + * a checkpoint after a reboot. + */ + if (!zfs_scan_legacy) { + scn->scn_is_sorted = B_TRUE; + if (scn->scn_last_checkpoint == 0) + scn->scn_last_checkpoint = ddi_get_lbolt(); } - if (dsl_scan_is_paused_scrub(scn)) - return; + /* + * For sorted scans, determine what kind of work we will be doing + * this txg based on our memory limitations and whether or not we + * need to perform a checkpoint. + */ + if (scn->scn_is_sorted) { + /* + * If we are over our checkpoint interval, set scn_clearing + * so that we can begin checkpointing immediately. The + * checkpoint allows us to save a consistent bookmark + * representing how much data we have scrubbed so far. + * Otherwise, use the memory limit to determine if we should + * scan for metadata or start issuing scrub IOs. We accumulate + * metadata until we hit our hard memory limit at which point + * we issue scrub IOs until we are at our soft memory limit. 
+ */ + if (scn->scn_checkpointing || + ddi_get_lbolt() - scn->scn_last_checkpoint > + SEC_TO_TICK(zfs_scan_checkpoint_intval)) { + if (!scn->scn_checkpointing) + zfs_dbgmsg("begin scan checkpoint"); - if (scn->scn_phys.scn_ddt_bookmark.ddb_class <= - scn->scn_phys.scn_ddt_class_max) { - zfs_dbgmsg("doing scan sync txg %llu; " - "ddt bm=%llu/%llu/%llu/%llx", - (longlong_t)tx->tx_txg, - (longlong_t)scn->scn_phys.scn_ddt_bookmark.ddb_class, - (longlong_t)scn->scn_phys.scn_ddt_bookmark.ddb_type, - (longlong_t)scn->scn_phys.scn_ddt_bookmark.ddb_checksum, - (longlong_t)scn->scn_phys.scn_ddt_bookmark.ddb_cursor); - ASSERT(scn->scn_phys.scn_bookmark.zb_objset == 0); - ASSERT(scn->scn_phys.scn_bookmark.zb_object == 0); - ASSERT(scn->scn_phys.scn_bookmark.zb_level == 0); - ASSERT(scn->scn_phys.scn_bookmark.zb_blkid == 0); + scn->scn_checkpointing = B_TRUE; + scn->scn_clearing = B_TRUE; + } else { + boolean_t should_clear = dsl_scan_should_clear(scn); + if (should_clear && !scn->scn_clearing) { + zfs_dbgmsg("begin scan clearing"); + scn->scn_clearing = B_TRUE; + } else if (!should_clear && scn->scn_clearing) { + zfs_dbgmsg("finish scan clearing"); + scn->scn_clearing = B_FALSE; + } + } } else { - zfs_dbgmsg("doing scan sync txg %llu; bm=%llu/%llu/%llu/%llu", - (longlong_t)tx->tx_txg, - (longlong_t)scn->scn_phys.scn_bookmark.zb_objset, - (longlong_t)scn->scn_phys.scn_bookmark.zb_object, - (longlong_t)scn->scn_phys.scn_bookmark.zb_level, - (longlong_t)scn->scn_phys.scn_bookmark.zb_blkid); + ASSERT0(scn->scn_checkpointing); + ASSERT0(scn->scn_clearing); } - scn->scn_zio_root = zio_root(dp->dp_spa, NULL, - NULL, ZIO_FLAG_CANFAIL); - dsl_pool_config_enter(dp, FTAG); - dsl_scan_visit(scn, tx); - dsl_pool_config_exit(dp, FTAG); - (void) zio_wait(scn->scn_zio_root); - scn->scn_zio_root = NULL; + if (!scn->scn_clearing && scn->scn_done_txg == 0) { + /* Need to scan metadata for more blocks to scrub */ + dsl_scan_phys_t *scnp = &scn->scn_phys; + taskqid_t prefetch_tqid; + uint64_t 
bytes_per_leaf = zfs_scan_vdev_limit; + uint64_t nr_leaves = dsl_scan_count_leaves(spa->spa_root_vdev); - zfs_dbgmsg("visited %llu blocks in %llums", - (longlong_t)scn->scn_visited_this_txg, - (longlong_t)NSEC2MSEC(gethrtime() - scn->scn_sync_start_time)); + /* + * Calculate the max number of in-flight bytes for pool-wide + * scanning operations (minimum 1MB). Limits for the issuing + * phase are done per top-level vdev and are handled separately. + */ + scn->scn_maxinflight_bytes = + MAX(nr_leaves * bytes_per_leaf, 1ULL << 20); - if (!scn->scn_suspending) { - scn->scn_done_txg = tx->tx_txg + 1; - zfs_dbgmsg("txg %llu traversal complete, waiting till txg %llu", - tx->tx_txg, scn->scn_done_txg); - } - - if (DSL_SCAN_IS_SCRUB_RESILVER(scn)) { - mutex_enter(&spa->spa_scrub_lock); - while (spa->spa_scrub_inflight > 0) { - cv_wait(&spa->spa_scrub_io_cv, - &spa->spa_scrub_lock); + if (scnp->scn_ddt_bookmark.ddb_class <= + scnp->scn_ddt_class_max) { + ASSERT(ZB_IS_ZERO(&scnp->scn_bookmark)); + zfs_dbgmsg("doing scan sync txg %llu; " + "ddt bm=%llu/%llu/%llu/%llx", + (longlong_t)tx->tx_txg, + (longlong_t)scnp->scn_ddt_bookmark.ddb_class, + (longlong_t)scnp->scn_ddt_bookmark.ddb_type, + (longlong_t)scnp->scn_ddt_bookmark.ddb_checksum, + (longlong_t)scnp->scn_ddt_bookmark.ddb_cursor); + } else { + zfs_dbgmsg("doing scan sync txg %llu; " + "bm=%llu/%llu/%llu/%llu", + (longlong_t)tx->tx_txg, + (longlong_t)scnp->scn_bookmark.zb_objset, + (longlong_t)scnp->scn_bookmark.zb_object, + (longlong_t)scnp->scn_bookmark.zb_level, + (longlong_t)scnp->scn_bookmark.zb_blkid); } - mutex_exit(&spa->spa_scrub_lock); - } - dsl_scan_sync_state(scn, tx); -} + scn->scn_zio_root = zio_root(dp->dp_spa, NULL, + NULL, ZIO_FLAG_CANFAIL); -/* - * This will start a new scan, or restart an existing one. 
- */ -void -dsl_resilver_restart(dsl_pool_t *dp, uint64_t txg) -{ - if (txg == 0) { - dmu_tx_t *tx; - tx = dmu_tx_create_dd(dp->dp_mos_dir); - VERIFY(0 == dmu_tx_assign(tx, TXG_WAIT)); + scn->scn_prefetch_stop = B_FALSE; + prefetch_tqid = taskq_dispatch(dp->dp_sync_taskq, + dsl_scan_prefetch_thread, scn, TQ_SLEEP); + ASSERT(prefetch_tqid != TASKQID_INVALID); - txg = dmu_tx_get_txg(tx); - dp->dp_scan->scn_restart_txg = txg; - dmu_tx_commit(tx); - } else { - dp->dp_scan->scn_restart_txg = txg; + dsl_pool_config_enter(dp, FTAG); + dsl_scan_visit(scn, tx); + dsl_pool_config_exit(dp, FTAG); + + mutex_enter(&dp->dp_spa->spa_scrub_lock); + scn->scn_prefetch_stop = B_TRUE; + cv_broadcast(&spa->spa_scrub_io_cv); + mutex_exit(&dp->dp_spa->spa_scrub_lock); + + taskq_wait_id(dp->dp_sync_taskq, prefetch_tqid); + (void) zio_wait(scn->scn_zio_root); + scn->scn_zio_root = NULL; + + zfs_dbgmsg("scan visited %llu blocks in %llums " + "(%llu os's, %llu holes, %llu < mintxg, " + "%llu in ddt, %llu > maxtxg)", + (longlong_t)scn->scn_visited_this_txg, + (longlong_t)NSEC2MSEC(gethrtime() - + scn->scn_sync_start_time), + (longlong_t)scn->scn_objsets_visited_this_txg, + (longlong_t)scn->scn_holes_this_txg, + (longlong_t)scn->scn_lt_min_this_txg, + (longlong_t)scn->scn_ddt_contained_this_txg, + (longlong_t)scn->scn_gt_max_this_txg); + + if (!scn->scn_suspending) { + ASSERT0(avl_numnodes(&scn->scn_queue)); + scn->scn_done_txg = tx->tx_txg + 1; + if (scn->scn_is_sorted) { + scn->scn_checkpointing = B_TRUE; + scn->scn_clearing = B_TRUE; + } + zfs_dbgmsg("scan complete txg %llu", + (longlong_t)tx->tx_txg); + } + } else if (scn->scn_is_sorted && scn->scn_bytes_pending != 0) { + /* need to issue scrubbing IOs from per-vdev queues */ + scn->scn_zio_root = zio_root(dp->dp_spa, NULL, + NULL, ZIO_FLAG_CANFAIL); + scan_io_queues_run(scn); + (void) zio_wait(scn->scn_zio_root); + scn->scn_zio_root = NULL; + + /* calculate and dprintf the current memory usage */ + (void) dsl_scan_should_clear(scn); + 
dsl_scan_update_stats(scn); + + zfs_dbgmsg("scrubbed %llu blocks (%llu segs) in %llums " + "(avg_block_size = %llu, avg_seg_size = %llu)", + (longlong_t)scn->scn_zios_this_txg, + (longlong_t)scn->scn_segs_this_txg, + (longlong_t)NSEC2MSEC(gethrtime() - + scn->scn_sync_start_time), + (longlong_t)scn->scn_avg_zio_size_this_txg, + (longlong_t)scn->scn_avg_seg_size_this_txg); + } else if (scn->scn_done_txg != 0 && scn->scn_done_txg <= tx->tx_txg) { + /* Finished with everything. Mark the scrub as complete */ + zfs_dbgmsg("scan issuing complete txg %llu", + (longlong_t)tx->tx_txg); + ASSERT3U(scn->scn_done_txg, !=, 0); + ASSERT0(spa->spa_scrub_inflight); + ASSERT0(scn->scn_bytes_pending); + dsl_scan_done(scn, B_TRUE, tx); + sync_type = SYNC_MANDATORY; } - zfs_dbgmsg("restarting resilver txg=%llu", txg); -} -boolean_t -dsl_scan_resilvering(dsl_pool_t *dp) -{ - return (dp->dp_scan->scn_phys.scn_state == DSS_SCANNING && - dp->dp_scan->scn_phys.scn_func == POOL_SCAN_RESILVER); + dsl_scan_sync_state(scn, tx, sync_type); } -/* - * scrub consumers - */ - static void -count_block(zfs_all_blkstats_t *zab, const blkptr_t *bp) +count_block(dsl_scan_t *scn, zfs_all_blkstats_t *zab, const blkptr_t *bp) { int i; + /* update the spa's stats on how many bytes we have issued */ + for (i = 0; i < BP_GET_NDVAS(bp); i++) { + atomic_add_64(&scn->scn_dp->dp_spa->spa_scan_pass_issued, + DVA_GET_ASIZE(&bp->blk_dva[i])); + } + /* * If we resume after a reboot, zab will be NULL; don't record * incomplete stats in that case. @@ -1912,6 +3398,8 @@ if (zab == NULL) return; + mutex_enter(&zab->zab_lock); + for (i = 0; i < 4; i++) { int l = (i < 2) ? BP_GET_LEVEL(bp) : DN_MAX_LEVELS; int t = (i & 1) ? 
BP_GET_TYPE(bp) : DMU_OT_TOTAL; @@ -1946,24 +3434,96 @@ break; } } + + mutex_exit(&zab->zab_lock); } static void -dsl_scan_scrub_done(zio_t *zio) +scan_io_queue_insert_impl(dsl_scan_io_queue_t *queue, scan_io_t *sio) { - spa_t *spa = zio->io_spa; + avl_index_t idx; + int64_t asize = sio->sio_asize; + dsl_scan_t *scn = queue->q_scn; - abd_free(zio->io_abd); + ASSERT(MUTEX_HELD(&queue->q_vd->vdev_scan_io_queue_lock)); - mutex_enter(&spa->spa_scrub_lock); - spa->spa_scrub_inflight--; - cv_broadcast(&spa->spa_scrub_io_cv); + if (avl_find(&queue->q_sios_by_addr, sio, &idx) != NULL) { + /* block is already scheduled for reading */ + atomic_add_64(&scn->scn_bytes_pending, -asize); + kmem_free(sio, sizeof (*sio)); + return; + } + avl_insert(&queue->q_sios_by_addr, sio, idx); + range_tree_add(queue->q_exts_by_addr, sio->sio_offset, asize); +} - if (zio->io_error && (zio->io_error != ECKSUM || - !(zio->io_flags & ZIO_FLAG_SPECULATIVE))) { - spa->spa_dsl_pool->dp_scan->scn_phys.scn_errors++; +/* + * Given all the info we got from our metadata scanning process, we + * construct a scan_io_t and insert it into the scan sorting queue. The + * I/O must already be suitable for us to process. This is controlled + * by dsl_scan_enqueue(). + */ +static void +scan_io_queue_insert(dsl_scan_io_queue_t *queue, const blkptr_t *bp, int dva_i, + int zio_flags, const zbookmark_phys_t *zb) +{ + dsl_scan_t *scn = queue->q_scn; + scan_io_t *sio = kmem_zalloc(sizeof (*sio), KM_SLEEP); + + ASSERT0(BP_IS_GANG(bp)); + ASSERT(MUTEX_HELD(&queue->q_vd->vdev_scan_io_queue_lock)); + + bp2sio(bp, sio, dva_i); + sio->sio_flags = zio_flags; + sio->sio_zb = *zb; + + /* + * Increment the bytes pending counter now so that we can't + * get an integer underflow in case the worker processes the + * zio before we get to incrementing this counter. 
+ */ + atomic_add_64(&scn->scn_bytes_pending, sio->sio_asize); + + scan_io_queue_insert_impl(queue, sio); +} + +/* + * Given a set of I/O parameters as discovered by the metadata traversal + * process, attempts to place the I/O into the sorted queues (if allowed), + * or immediately executes the I/O. + */ +static void +dsl_scan_enqueue(dsl_pool_t *dp, const blkptr_t *bp, int zio_flags, + const zbookmark_phys_t *zb) +{ + spa_t *spa = dp->dp_spa; + + ASSERT(!BP_IS_EMBEDDED(bp)); + + /* + * Gang blocks are hard to issue sequentially, so we just issue them + * here immediately instead of queuing them. + */ + if (!dp->dp_scan->scn_is_sorted || BP_IS_GANG(bp)) { + scan_exec_io(dp, bp, zio_flags, zb, NULL); + return; } - mutex_exit(&spa->spa_scrub_lock); + for (int i = 0; i < BP_GET_NDVAS(bp); i++) { + dva_t dva; + vdev_t *vdev; + + dva = bp->blk_dva[i]; + vdev = vdev_lookup_top(spa, DVA_GET_VDEV(&dva)); + ASSERT(vdev != NULL); + + mutex_enter(&vdev->vdev_scan_io_queue_lock); + if (vdev->vdev_scan_io_queue == NULL) + vdev->vdev_scan_io_queue = scan_io_queue_create(vdev); + ASSERT(dp->dp_scan != NULL); + scan_io_queue_insert(vdev->vdev_scan_io_queue, bp, + i, zio_flags, zb); + mutex_exit(&vdev->vdev_scan_io_queue_lock); + } } static int @@ -1971,137 +3531,390 @@ const blkptr_t *bp, const zbookmark_phys_t *zb) { dsl_scan_t *scn = dp->dp_scan; - size_t size = BP_GET_PSIZE(bp); spa_t *spa = dp->dp_spa; uint64_t phys_birth = BP_PHYSICAL_BIRTH(bp); + size_t psize = BP_GET_PSIZE(bp); boolean_t needs_io; int zio_flags = ZIO_FLAG_SCAN_THREAD | ZIO_FLAG_RAW | ZIO_FLAG_CANFAIL; - unsigned int scan_delay = 0; - + int d; + if (phys_birth <= scn->scn_phys.scn_min_txg || phys_birth >= scn->scn_phys.scn_max_txg) return (0); - count_block(dp->dp_blkstats, bp); - - if (BP_IS_EMBEDDED(bp)) + if (BP_IS_EMBEDDED(bp)) { + count_block(scn, dp->dp_blkstats, bp); return (0); + } ASSERT(DSL_SCAN_IS_SCRUB_RESILVER(scn)); if (scn->scn_phys.scn_func == POOL_SCAN_SCRUB) { zio_flags |= ZIO_FLAG_SCRUB; 
needs_io = B_TRUE; - scan_delay = zfs_scrub_delay; } else { ASSERT3U(scn->scn_phys.scn_func, ==, POOL_SCAN_RESILVER); zio_flags |= ZIO_FLAG_RESILVER; needs_io = B_FALSE; - scan_delay = zfs_resilver_delay; } /* If it's an intent log block, failure is expected. */ if (zb->zb_level == ZB_ZIL_LEVEL) zio_flags |= ZIO_FLAG_SPECULATIVE; - for (int d = 0; d < BP_GET_NDVAS(bp); d++) { - vdev_t *vd = vdev_lookup_top(spa, - DVA_GET_VDEV(&bp->blk_dva[d])); + for (d = 0; d < BP_GET_NDVAS(bp); d++) { + const dva_t *dva = &bp->blk_dva[d]; /* * Keep track of how much data we've examined so that * zpool(1M) status can make useful progress reports. */ - scn->scn_phys.scn_examined += DVA_GET_ASIZE(&bp->blk_dva[d]); - spa->spa_scan_pass_exam += DVA_GET_ASIZE(&bp->blk_dva[d]); + scn->scn_phys.scn_examined += DVA_GET_ASIZE(dva); + spa->spa_scan_pass_exam += DVA_GET_ASIZE(dva); /* if it's a resilver, this may not be in the target range */ - if (!needs_io) { - if (DVA_GET_GANG(&bp->blk_dva[d])) { - /* - * Gang members may be spread across multiple - * vdevs, so the best estimate we have is the - * scrub range, which has already been checked. - * XXX -- it would be better to change our - * allocation policy to ensure that all - * gang members reside on the same vdev. 
- */ - needs_io = B_TRUE; - } else { - needs_io = vdev_dtl_contains(vd, DTL_PARTIAL, - phys_birth, 1); - } - } + if (!needs_io) + needs_io = dsl_scan_need_resilver(spa, dva, psize, + phys_birth); } if (needs_io && !zfs_no_scrub_io) { - vdev_t *rvd = spa->spa_root_vdev; - uint64_t maxinflight = rvd->vdev_children * - MAX(zfs_top_maxinflight, 1); + dsl_scan_enqueue(dp, bp, zio_flags, zb); + } else { + count_block(scn, dp->dp_blkstats, bp); + } + /* do not relocate this block */ + return (0); +} + +static void +dsl_scan_scrub_done(zio_t *zio) +{ + spa_t *spa = zio->io_spa; + blkptr_t *bp = zio->io_bp; + dsl_scan_io_queue_t *queue = zio->io_private; + + abd_free(zio->io_abd); + + if (queue == NULL) { mutex_enter(&spa->spa_scrub_lock); - while (spa->spa_scrub_inflight >= maxinflight) + ASSERT3U(spa->spa_scrub_inflight, >=, BP_GET_PSIZE(bp)); + spa->spa_scrub_inflight -= BP_GET_PSIZE(bp); + cv_broadcast(&spa->spa_scrub_io_cv); + mutex_exit(&spa->spa_scrub_lock); + } else { + mutex_enter(&queue->q_vd->vdev_scan_io_queue_lock); + ASSERT3U(queue->q_inflight_bytes, >=, BP_GET_PSIZE(bp)); + queue->q_inflight_bytes -= BP_GET_PSIZE(bp); + cv_broadcast(&queue->q_zio_cv); + mutex_exit(&queue->q_vd->vdev_scan_io_queue_lock); + } + + if (zio->io_error && (zio->io_error != ECKSUM || + !(zio->io_flags & ZIO_FLAG_SPECULATIVE))) { + atomic_inc_64(&spa->spa_dsl_pool->dp_scan->scn_phys.scn_errors); + } +} + +/* + * Given a scanning zio's information, executes the zio. The zio need + * not necessarily be only sortable, this function simply executes the + * zio, no matter what it is. The optional queue argument allows the + * caller to specify that they want per top level vdev IO rate limiting + * instead of the legacy global limiting. 
+ */ +static void +scan_exec_io(dsl_pool_t *dp, const blkptr_t *bp, int zio_flags, + const zbookmark_phys_t *zb, dsl_scan_io_queue_t *queue) +{ + spa_t *spa = dp->dp_spa; + dsl_scan_t *scn = dp->dp_scan; + size_t size = BP_GET_PSIZE(bp); + abd_t *data = abd_alloc_for_io(size, B_FALSE); + unsigned int scan_delay = 0; + + if (queue == NULL) { + mutex_enter(&spa->spa_scrub_lock); + while (spa->spa_scrub_inflight >= scn->scn_maxinflight_bytes) cv_wait(&spa->spa_scrub_io_cv, &spa->spa_scrub_lock); - spa->spa_scrub_inflight++; + spa->spa_scrub_inflight += BP_GET_PSIZE(bp); mutex_exit(&spa->spa_scrub_lock); + } else { + kmutex_t *q_lock = &queue->q_vd->vdev_scan_io_queue_lock; - /* - * If we're seeing recent (zfs_scan_idle) "important" I/Os - * then throttle our workload to limit the impact of a scan. - */ - if (ddi_get_lbolt64() - spa->spa_last_io <= zfs_scan_idle) - delay(MAX((int)scan_delay, 0)); + mutex_enter(q_lock); + while (queue->q_inflight_bytes >= queue->q_maxinflight_bytes) + cv_wait(&queue->q_zio_cv, q_lock); + queue->q_inflight_bytes += BP_GET_PSIZE(bp); + mutex_exit(q_lock); + } - zio_nowait(zio_read(NULL, spa, bp, - abd_alloc_for_io(size, B_FALSE), size, dsl_scan_scrub_done, - NULL, ZIO_PRIORITY_SCRUB, zio_flags, zb)); + if (zio_flags & ZIO_FLAG_RESILVER) + scan_delay = zfs_resilver_delay; + else { + ASSERT(zio_flags & ZIO_FLAG_SCRUB); + scan_delay = zfs_scrub_delay; } - /* do not relocate this block */ - return (0); + if (scan_delay && (ddi_get_lbolt64() - spa->spa_last_io <= zfs_scan_idle)) + delay(MAX((int)scan_delay, 0)); + + count_block(dp->dp_scan, dp->dp_blkstats, bp); + zio_nowait(zio_read(dp->dp_scan->scn_zio_root, spa, bp, data, size, + dsl_scan_scrub_done, queue, ZIO_PRIORITY_SCRUB, zio_flags, zb)); } /* - * Called by the ZFS_IOC_POOL_SCAN ioctl to start a scrub or resilver. - * Can also be called to resume a paused scrub. + * This is the primary extent sorting algorithm. 
We balance two parameters: + * 1) how many bytes of I/O are in an extent + * 2) how well the extent is filled with I/O (as a fraction of its total size) + * Since we allow extents to have gaps between their constituent I/Os, it's + * possible to have a fairly large extent that contains the same amount of + * I/O bytes as a much smaller extent, which just packs the I/O more tightly. + * The algorithm sorts based on a score calculated from the extent's size, + * the relative fill volume (in %) and a "fill weight" parameter that controls + * the split between whether we prefer larger extents or more well populated + * extents: + * + * SCORE = FILL_IN_BYTES + (FILL_IN_PERCENT * FILL_IN_BYTES * FILL_WEIGHT) + * + * Example: + * 1) assume extsz = 64 MiB + * 2) assume fill = 32 MiB (extent is half full) + * 3) assume fill_weight = 3 + * 4) SCORE = 32M + (((32M * 100) / 64M) * 3 * 32M) / 100 + * SCORE = 32M + (50 * 3 * 32M) / 100 + * SCORE = 32M + (4800M / 100) + * SCORE = 32M + 48M + * ^ ^ + * | +--- final total relative fill-based score + * +--------- final total fill-based score + * SCORE = 80M + * + * As can be seen, at fill_weight=3, the algorithm is slightly biased towards + * extents that are more completely filled (in a 3:2 ratio) vs just larger. + * Note that as an optimization, we replace multiplication and division by + * 100 with bitshifting by 7 (which effectively multiplies and divides by 128).
*/ -int -dsl_scan(dsl_pool_t *dp, pool_scan_func_t func) +static int +ext_size_compare(const void *x, const void *y) { - spa_t *spa = dp->dp_spa; + const range_seg_t *rsa = x, *rsb = y; + uint64_t sa = rsa->rs_end - rsa->rs_start, + sb = rsb->rs_end - rsb->rs_start; + uint64_t score_a, score_b; + + score_a = rsa->rs_fill + ((((rsa->rs_fill << 7) / sa) * + fill_weight * rsa->rs_fill) >> 7); + score_b = rsb->rs_fill + ((((rsb->rs_fill << 7) / sb) * + fill_weight * rsb->rs_fill) >> 7); + + if (score_a > score_b) + return (-1); + if (score_a == score_b) { + if (rsa->rs_start < rsb->rs_start) + return (-1); + if (rsa->rs_start == rsb->rs_start) + return (0); + return (1); + } + return (1); +} + +/* + * Comparator for the q_sios_by_addr tree. Sorting is simply performed + * based on LBA-order (from lowest to highest). + */ +static int +io_addr_compare(const void *x, const void *y) +{ + const scan_io_t *a = x, *b = y; + + if (a->sio_offset < b->sio_offset) + return (-1); + if (a->sio_offset == b->sio_offset) + return (0); + return (1); +} + +/* IO queues are created on demand when they are needed. */ +static dsl_scan_io_queue_t * +scan_io_queue_create(vdev_t *vd) +{ + dsl_scan_t *scn = vd->vdev_spa->spa_dsl_pool->dp_scan; + dsl_scan_io_queue_t *q = kmem_zalloc(sizeof (*q), KM_SLEEP); + + q->q_scn = scn; + q->q_vd = vd; + cv_init(&q->q_zio_cv, NULL, CV_DEFAULT, NULL); + q->q_exts_by_addr = range_tree_create_impl(&rt_avl_ops, + &q->q_exts_by_size, ext_size_compare, zfs_scan_max_ext_gap); + avl_create(&q->q_sios_by_addr, io_addr_compare, + sizeof (scan_io_t), offsetof(scan_io_t, sio_nodes.sio_addr_node)); + + return (q); +} + +/* + * Destroys a scan queue and all segments and scan_io_t's contained in it. + * No further execution of I/O occurs, anything pending in the queue is + * simply freed without being executed. 
+ */ +void +dsl_scan_io_queue_destroy(dsl_scan_io_queue_t *queue) +{ + dsl_scan_t *scn = queue->q_scn; + scan_io_t *sio; + void *cookie = NULL; + int64_t bytes_dequeued = 0; + + ASSERT(MUTEX_HELD(&queue->q_vd->vdev_scan_io_queue_lock)); + + while ((sio = avl_destroy_nodes(&queue->q_sios_by_addr, &cookie)) != + NULL) { + ASSERT(range_tree_contains(queue->q_exts_by_addr, + sio->sio_offset, sio->sio_asize)); + bytes_dequeued += sio->sio_asize; + kmem_free(sio, sizeof (*sio)); + } + + atomic_add_64(&scn->scn_bytes_pending, -bytes_dequeued); + range_tree_vacate(queue->q_exts_by_addr, NULL, queue); + range_tree_destroy(queue->q_exts_by_addr); + avl_destroy(&queue->q_sios_by_addr); + cv_destroy(&queue->q_zio_cv); + + kmem_free(queue, sizeof (*queue)); +} + +/* + * Properly transfers a dsl_scan_queue_t from `svd' to `tvd'. This is + * called on behalf of vdev_top_transfer when creating or destroying + * a mirror vdev due to zpool attach/detach. + */ +void +dsl_scan_io_queue_vdev_xfer(vdev_t *svd, vdev_t *tvd) +{ + mutex_enter(&svd->vdev_scan_io_queue_lock); + mutex_enter(&tvd->vdev_scan_io_queue_lock); + + VERIFY3P(tvd->vdev_scan_io_queue, ==, NULL); + tvd->vdev_scan_io_queue = svd->vdev_scan_io_queue; + svd->vdev_scan_io_queue = NULL; + if (tvd->vdev_scan_io_queue != NULL) + tvd->vdev_scan_io_queue->q_vd = tvd; + + mutex_exit(&tvd->vdev_scan_io_queue_lock); + mutex_exit(&svd->vdev_scan_io_queue_lock); +} + +static void +scan_io_queues_destroy(dsl_scan_t *scn) +{ + vdev_t *rvd = scn->scn_dp->dp_spa->spa_root_vdev; + + for (uint64_t i = 0; i < rvd->vdev_children; i++) { + vdev_t *tvd = rvd->vdev_child[i]; + + mutex_enter(&tvd->vdev_scan_io_queue_lock); + if (tvd->vdev_scan_io_queue != NULL) + dsl_scan_io_queue_destroy(tvd->vdev_scan_io_queue); + tvd->vdev_scan_io_queue = NULL; + mutex_exit(&tvd->vdev_scan_io_queue_lock); + } +} + +static void +dsl_scan_freed_dva(spa_t *spa, const blkptr_t *bp, int dva_i) +{ + dsl_pool_t *dp = spa->spa_dsl_pool; dsl_scan_t *scn = 
dp->dp_scan; + vdev_t *vdev; + kmutex_t *q_lock; + dsl_scan_io_queue_t *queue; + scan_io_t srch, *sio; + avl_index_t idx; + uint64_t start, size; + vdev = vdev_lookup_top(spa, DVA_GET_VDEV(&bp->blk_dva[dva_i])); + ASSERT(vdev != NULL); + q_lock = &vdev->vdev_scan_io_queue_lock; + queue = vdev->vdev_scan_io_queue; + + mutex_enter(q_lock); + if (queue == NULL) { + mutex_exit(q_lock); + return; + } + + bp2sio(bp, &srch, dva_i); + start = srch.sio_offset; + size = srch.sio_asize; + /* - * Purge all vdev caches and probe all devices. We do this here - * rather than in sync context because this requires a writer lock - * on the spa_config lock, which we can't do from sync context. The - * spa_scrub_reopen flag indicates that vdev_open() should not - * attempt to start another scrub. + * We can find the zio in two states: + * 1) Cold, just sitting in the queue of zio's to be issued at + * some point in the future. In this case, all we do is + * remove the zio from the q_sios_by_addr tree, decrement + * its data volume from the containing range_seg_t and + * resort the q_exts_by_size tree to reflect that the + * range_seg_t has lost some of its 'fill'. We don't shorten + * the range_seg_t - this is usually rare enough not to be + * worth the extra hassle of trying to keep track of precise + * extent boundaries. + * 2) Hot, where the zio is currently in-flight in + * dsl_scan_issue_ios. In this case, we can't simply + * reach in and stop the in-flight zio's, so we instead + * block the caller. Eventually, dsl_scan_issue_ios will + * be done with issuing the zio's it gathered and will + * signal us.
*/ - spa_vdev_state_enter(spa, SCL_NONE); - spa->spa_scrub_reopen = B_TRUE; - vdev_reopen(spa->spa_root_vdev); - spa->spa_scrub_reopen = B_FALSE; - (void) spa_vdev_state_exit(spa, NULL, 0); + sio = avl_find(&queue->q_sios_by_addr, &srch, &idx); + if (sio != NULL) { + int64_t asize = sio->sio_asize; + blkptr_t tmpbp; - if (func == POOL_SCAN_SCRUB && dsl_scan_is_paused_scrub(scn)) { - /* got scrub start cmd, resume paused scrub */ - int err = dsl_scrub_set_pause_resume(scn->scn_dp, - POOL_SCRUB_NORMAL); - if (err == 0) { - spa_event_notify(spa, NULL, NULL, ESC_ZFS_SCRUB_RESUME); - return (ECANCELED); - } + /* Got it while it was cold in the queue */ + ASSERT3U(start, ==, sio->sio_offset); + ASSERT3U(size, ==, asize); + avl_remove(&queue->q_sios_by_addr, sio); - return (SET_ERROR(err)); - } + ASSERT(range_tree_contains(queue->q_exts_by_addr, start, size)); + range_tree_remove_fill(queue->q_exts_by_addr, start, size); - return (dsl_sync_task(spa_name(spa), dsl_scan_setup_check, - dsl_scan_setup_sync, &func, 0, ZFS_SPACE_CHECK_EXTRA_RESERVED)); + /* + * We only update scn_bytes_pending in the cold path, + * otherwise it will already have been accounted for as + * part of the zio's execution. + */ + atomic_add_64(&scn->scn_bytes_pending, -asize); + + /* count the block as though we issued it */ + sio2bp(sio, &tmpbp, dva_i); + count_block(scn, dp->dp_blkstats, &tmpbp); + + kmem_free(sio, sizeof (*sio)); + } + mutex_exit(q_lock); } -static boolean_t -dsl_scan_restarting(dsl_scan_t *scn, dmu_tx_t *tx) +/* + * Callback invoked when a zio_free() zio is executing. This needs to be + * intercepted to prevent the zio from deallocating a particular portion + * of disk space and it then getting reallocated and written to, while we + * still have it queued up for processing. 
+ */ +void +dsl_scan_freed(spa_t *spa, const blkptr_t *bp) { - return (scn->scn_restart_txg != 0 && - scn->scn_restart_txg <= tx->tx_txg); + dsl_pool_t *dp = spa->spa_dsl_pool; + dsl_scan_t *scn = dp->dp_scan; + + ASSERT(!BP_IS_EMBEDDED(bp)); + ASSERT(scn != NULL); + if (!dsl_scan_is_running(scn)) + return; + + for (int i = 0; i < BP_GET_NDVAS(bp); i++) + dsl_scan_freed_dva(spa, bp, i); } Index: sys/cddl/contrib/opensolaris/uts/common/fs/zfs/metaslab.c =================================================================== --- sys/cddl/contrib/opensolaris/uts/common/fs/zfs/metaslab.c +++ sys/cddl/contrib/opensolaris/uts/common/fs/zfs/metaslab.c @@ -1120,85 +1120,6 @@ } /* - * Create any block allocator specific components. The current allocators - * rely on using both a size-ordered range_tree_t and an array of uint64_t's. - */ -static void -metaslab_rt_create(range_tree_t *rt, void *arg) -{ - metaslab_t *msp = arg; - - ASSERT3P(rt->rt_arg, ==, msp); - ASSERT(msp->ms_allocatable == NULL); - - avl_create(&msp->ms_allocatable_by_size, metaslab_rangesize_compare, - sizeof (range_seg_t), offsetof(range_seg_t, rs_pp_node)); -} - -/* - * Destroy the block allocator specific components. 
- */ -static void -metaslab_rt_destroy(range_tree_t *rt, void *arg) -{ - metaslab_t *msp = arg; - - ASSERT3P(rt->rt_arg, ==, msp); - ASSERT3P(msp->ms_allocatable, ==, rt); - ASSERT0(avl_numnodes(&msp->ms_allocatable_by_size)); - - avl_destroy(&msp->ms_allocatable_by_size); -} - -static void -metaslab_rt_add(range_tree_t *rt, range_seg_t *rs, void *arg) -{ - metaslab_t *msp = arg; - - ASSERT3P(rt->rt_arg, ==, msp); - ASSERT3P(msp->ms_allocatable, ==, rt); - VERIFY(!msp->ms_condensing); - avl_add(&msp->ms_allocatable_by_size, rs); -} - -static void -metaslab_rt_remove(range_tree_t *rt, range_seg_t *rs, void *arg) -{ - metaslab_t *msp = arg; - - ASSERT3P(rt->rt_arg, ==, msp); - ASSERT3P(msp->ms_allocatable, ==, rt); - VERIFY(!msp->ms_condensing); - avl_remove(&msp->ms_allocatable_by_size, rs); -} - -static void -metaslab_rt_vacate(range_tree_t *rt, void *arg) -{ - metaslab_t *msp = arg; - - ASSERT3P(rt->rt_arg, ==, msp); - ASSERT3P(msp->ms_allocatable, ==, rt); - - /* - * Normally one would walk the tree freeing nodes along the way. - * Since the nodes are shared with the range trees we can avoid - * walking all nodes and just reinitialize the avl tree. The nodes - * will be freed by the range tree, so we don't want to free them here. - */ - avl_create(&msp->ms_allocatable_by_size, metaslab_rangesize_compare, - sizeof (range_seg_t), offsetof(range_seg_t, rs_pp_node)); -} - -static range_tree_ops_t metaslab_rt_ops = { - metaslab_rt_create, - metaslab_rt_destroy, - metaslab_rt_add, - metaslab_rt_remove, - metaslab_rt_vacate -}; - -/* * ========================================================================== * Common allocator routines * ========================================================================== @@ -1574,7 +1495,8 @@ * addition of new space; and for debugging, it ensures that we'd * data fault on any attempt to use this metaslab before it's ready. 
*/ - ms->ms_allocatable = range_tree_create(&metaslab_rt_ops, ms); + ms->ms_allocatable = range_tree_create_impl(&rt_avl_ops, &ms->ms_allocatable_by_size, + metaslab_rangesize_compare, 0); metaslab_group_add(mg, ms); metaslab_set_fragmentation(ms); Index: sys/cddl/contrib/opensolaris/uts/common/fs/zfs/range_tree.c =================================================================== --- sys/cddl/contrib/opensolaris/uts/common/fs/zfs/range_tree.c +++ sys/cddl/contrib/opensolaris/uts/common/fs/zfs/range_tree.c @@ -33,8 +33,58 @@ #include #include +/* + * Range trees are tree-based data structures that can be used to + * track free space or generally any space allocation information. + * A range tree keeps track of individual segments and automatically + * provides facilities such as adjacent extent merging and extent + * splitting in response to range add/remove requests. + * + * A range tree starts out completely empty, with no segments in it. + * Adding an allocation via range_tree_add to the range tree can either: + * 1) create a new extent + * 2) extend an adjacent extent + * 3) merge two adjacent extents + * Conversely, removing an allocation via range_tree_remove can: + * 1) completely remove an extent + * 2) shorten an extent (if the allocation was near one of its ends) + * 3) split an extent into two extents, in effect punching a hole + * + * A range tree is also capable of 'bridging' gaps when adding + * allocations. This is useful for cases when close proximity of + * allocations is an important detail that needs to be represented + * in the range tree. See range_tree_set_gap(). The default behavior + * is not to bridge gaps (i.e. the maximum allowed gap size is 0). + * + * In order to traverse a range tree, use either the range_tree_walk() + * or range_tree_vacate() functions. 
+ * + * To obtain more accurate information on individual segment + * operations that the range tree performs "under the hood", you can + * specify a set of callbacks by passing a range_tree_ops_t structure + * to the range_tree_create function. Any callbacks that are non-NULL + * are then called at the appropriate times. + * + * The range tree code also supports a special variant of range trees + * that can bridge small gaps between segments. This kind of tree is used + * by the dsl scanning code to group I/Os into mostly sequential chunks to + * optimize disk performance. The code here attempts to do this with as + * little memory and computational overhead as possible. One limitation of + * this implementation is that segments of range trees with gaps can only + * support removing complete segments. + */ + kmem_cache_t *range_seg_cache; +/* Generic ops for managing an AVL tree alongside a range tree */ +struct range_tree_ops rt_avl_ops = { + .rtop_create = rt_avl_create, + .rtop_destroy = rt_avl_destroy, + .rtop_add = rt_avl_add, + .rtop_remove = rt_avl_remove, + .rtop_vacate = rt_avl_vacate, +}; + void range_tree_init(void) { @@ -109,47 +159,47 @@ static int range_tree_seg_compare(const void *x1, const void *x2) { - const range_seg_t *r1 = x1; - const range_seg_t *r2 = x2; + const range_seg_t *r1 = (const range_seg_t *)x1; + const range_seg_t *r2 = (const range_seg_t *)x2; - if (r1->rs_start < r2->rs_start) { - if (r1->rs_end > r2->rs_start) - return (0); - return (-1); - } - if (r1->rs_start > r2->rs_start) { - if (r1->rs_start < r2->rs_end) - return (0); - return (1); - } - return (0); + ASSERT3U(r1->rs_start, <=, r1->rs_end); + ASSERT3U(r2->rs_start, <=, r2->rs_end); + + return ((r1->rs_start >= r2->rs_end) - (r1->rs_end <= r2->rs_start)); } range_tree_t * -range_tree_create(range_tree_ops_t *ops, void *arg) +range_tree_create_impl(range_tree_ops_t *ops, void *arg, + int (*avl_compare) (const void *, const void *), uint64_t gap) { - range_tree_t *rt; + 
range_tree_t *rt = kmem_zalloc(sizeof (range_tree_t), KM_SLEEP); - rt = kmem_zalloc(sizeof (range_tree_t), KM_SLEEP); - avl_create(&rt->rt_root, range_tree_seg_compare, sizeof (range_seg_t), offsetof(range_seg_t, rs_node)); rt->rt_ops = ops; rt->rt_arg = arg; + rt->rt_gap = gap; + rt->rt_avl_compare = avl_compare; - if (rt->rt_ops != NULL) + if (rt->rt_ops != NULL && rt->rt_ops->rtop_create != NULL) rt->rt_ops->rtop_create(rt, rt->rt_arg); return (rt); } +range_tree_t * +range_tree_create(range_tree_ops_t *ops, void *arg) +{ + return (range_tree_create_impl(ops, arg, NULL, 0)); +} + void range_tree_destroy(range_tree_t *rt) { VERIFY0(rt->rt_space); - if (rt->rt_ops != NULL) + if (rt->rt_ops != NULL && rt->rt_ops->rtop_destroy != NULL) rt->rt_ops->rtop_destroy(rt, rt->rt_arg); avl_destroy(&rt->rt_root); @@ -157,39 +207,99 @@ } void -range_tree_add(void *arg, uint64_t start, uint64_t size) +range_tree_adjust_fill(range_tree_t *rt, range_seg_t *rs, int64_t delta) { + ASSERT3U(rs->rs_fill + delta, !=, 0); + ASSERT3U(rs->rs_fill + delta, <=, rs->rs_end - rs->rs_start); + + if (rt->rt_ops != NULL && rt->rt_ops->rtop_remove != NULL) + rt->rt_ops->rtop_remove(rt, rs, rt->rt_arg); + rs->rs_fill += delta; + if (rt->rt_ops != NULL && rt->rt_ops->rtop_add != NULL) + rt->rt_ops->rtop_add(rt, rs, rt->rt_arg); +} + +static void +range_tree_add_impl(void *arg, uint64_t start, uint64_t size, uint64_t fill) +{ range_tree_t *rt = arg; avl_index_t where; range_seg_t rsearch, *rs_before, *rs_after, *rs; - uint64_t end = start + size; + uint64_t end = start + size, gap = rt->rt_gap; + uint64_t bridge_size = 0; boolean_t merge_before, merge_after; - VERIFY(size != 0); + ASSERT3U(size, !=, 0); + ASSERT3U(fill, <=, size); rsearch.rs_start = start; rsearch.rs_end = end; rs = avl_find(&rt->rt_root, &rsearch, &where); - if (rs != NULL && rs->rs_start <= start && rs->rs_end >= end) { + if (gap == 0 && rs != NULL && + rs->rs_start <= start && rs->rs_end >= end) { zfs_panic_recover("zfs: 
allocating allocated segment" - "(offset=%llu size=%llu)\n", - (longlong_t)start, (longlong_t)size); + "(offset=%llu size=%llu) of (offset=%llu size=%llu)\n", + (longlong_t)start, (longlong_t)size, + (longlong_t)rs->rs_start, + (longlong_t)rs->rs_end - rs->rs_start); return; } - /* Make sure we don't overlap with either of our neighbors */ - VERIFY(rs == NULL); + /* + * If this is a gap-supporting range tree, it is possible that we + * are inserting into an existing segment. In this case simply + * bump the fill count and call the remove / add callbacks. If the + * new range will extend an existing segment, we remove the + * existing one, apply the new extent to it and re-insert it using + * the normal code paths. + */ + if (rs != NULL) { + ASSERT3U(gap, !=, 0); + if (rs->rs_start <= start && rs->rs_end >= end) { + range_tree_adjust_fill(rt, rs, fill); + return; + } + avl_remove(&rt->rt_root, rs); + if (rt->rt_ops != NULL && rt->rt_ops->rtop_remove != NULL) + rt->rt_ops->rtop_remove(rt, rs, rt->rt_arg); + + range_tree_stat_decr(rt, rs); + rt->rt_space -= rs->rs_end - rs->rs_start; + + fill += rs->rs_fill; + start = MIN(start, rs->rs_start); + end = MAX(end, rs->rs_end); + size = end - start; + + range_tree_add_impl(rt, start, size, fill); + + kmem_cache_free(range_seg_cache, rs); + return; + } + + ASSERT3P(rs, ==, NULL); + + /* + * Determine whether or not we will have to merge with our neighbors. + * If gap != 0, we might need to merge with our neighbors even if we + * aren't directly touching. 
+ */ rs_before = avl_nearest(&rt->rt_root, where, AVL_BEFORE); rs_after = avl_nearest(&rt->rt_root, where, AVL_AFTER); - merge_before = (rs_before != NULL && rs_before->rs_end == start); - merge_after = (rs_after != NULL && rs_after->rs_start == end); + merge_before = (rs_before != NULL && rs_before->rs_end >= start - gap); + merge_after = (rs_after != NULL && rs_after->rs_start <= end + gap); + if (merge_before && gap != 0) + bridge_size += start - rs_before->rs_end; + if (merge_after && gap != 0) + bridge_size += rs_after->rs_start - end; + if (merge_before && merge_after) { avl_remove(&rt->rt_root, rs_before); - if (rt->rt_ops != NULL) { + if (rt->rt_ops != NULL && rt->rt_ops->rtop_remove != NULL) { rt->rt_ops->rtop_remove(rt, rs_before, rt->rt_arg); rt->rt_ops->rtop_remove(rt, rs_after, rt->rt_arg); } @@ -197,43 +307,59 @@ range_tree_stat_decr(rt, rs_before); range_tree_stat_decr(rt, rs_after); + rs_after->rs_fill += rs_before->rs_fill + fill; rs_after->rs_start = rs_before->rs_start; kmem_cache_free(range_seg_cache, rs_before); rs = rs_after; } else if (merge_before) { - if (rt->rt_ops != NULL) + if (rt->rt_ops != NULL && rt->rt_ops->rtop_remove != NULL) rt->rt_ops->rtop_remove(rt, rs_before, rt->rt_arg); range_tree_stat_decr(rt, rs_before); + rs_before->rs_fill += fill; rs_before->rs_end = end; rs = rs_before; } else if (merge_after) { - if (rt->rt_ops != NULL) + if (rt->rt_ops != NULL && rt->rt_ops->rtop_remove != NULL) rt->rt_ops->rtop_remove(rt, rs_after, rt->rt_arg); range_tree_stat_decr(rt, rs_after); + rs_after->rs_fill += fill; rs_after->rs_start = start; rs = rs_after; } else { rs = kmem_cache_alloc(range_seg_cache, KM_SLEEP); + + rs->rs_fill = fill; rs->rs_start = start; rs->rs_end = end; avl_insert(&rt->rt_root, rs, where); } - if (rt->rt_ops != NULL) + if (gap != 0) + ASSERT3U(rs->rs_fill, <=, rs->rs_end - rs->rs_start); + else + ASSERT3U(rs->rs_fill, ==, rs->rs_end - rs->rs_start); + + if (rt->rt_ops != NULL && rt->rt_ops->rtop_add != NULL) 
rt->rt_ops->rtop_add(rt, rs, rt->rt_arg); range_tree_stat_incr(rt, rs); - rt->rt_space += size; + rt->rt_space += size + bridge_size; } void -range_tree_remove(void *arg, uint64_t start, uint64_t size) +range_tree_add(void *arg, uint64_t start, uint64_t size) { - range_tree_t *rt = arg; + range_tree_add_impl(arg, start, size, size); +} + +static void +range_tree_remove_impl(range_tree_t *rt, uint64_t start, uint64_t size, + boolean_t do_fill) +{ avl_index_t where; range_seg_t rsearch, *rs, *newseg; uint64_t end = start + size; @@ -253,6 +379,34 @@ (longlong_t)start, (longlong_t)size); return; } + + /* + * Range trees with gap support must only remove complete segments + * from the tree. This allows us to maintain accurate fill accounting + * and to ensure that bridged sections are not leaked. If we need to + * remove less than the full segment, we can only adjust the fill count. + */ + if (rt->rt_gap != 0) { + if (do_fill) { + if (rs->rs_fill == size) { + start = rs->rs_start; + end = rs->rs_end; + size = end - start; + } else { + range_tree_adjust_fill(rt, rs, -size); + return; + } + } else if (rs->rs_start != start || rs->rs_end != end) { + zfs_panic_recover("zfs: freeing partial segment of " + "gap tree (offset=%llu size=%llu) of " + "(offset=%llu size=%llu)", + (longlong_t)start, (longlong_t)size, + (longlong_t)rs->rs_start, + (longlong_t)rs->rs_end - rs->rs_start); + return; + } + } + VERIFY3U(rs->rs_start, <=, start); VERIFY3U(rs->rs_end, >=, end); @@ -261,19 +415,20 @@ range_tree_stat_decr(rt, rs); - if (rt->rt_ops != NULL) + if (rt->rt_ops != NULL && rt->rt_ops->rtop_remove != NULL) rt->rt_ops->rtop_remove(rt, rs, rt->rt_arg); if (left_over && right_over) { newseg = kmem_cache_alloc(range_seg_cache, KM_SLEEP); newseg->rs_start = end; newseg->rs_end = rs->rs_end; + newseg->rs_fill = newseg->rs_end - newseg->rs_start; range_tree_stat_incr(rt, newseg); rs->rs_end = start; avl_insert_here(&rt->rt_root, newseg, rs, AVL_AFTER); - if (rt->rt_ops != NULL) + if 
(rt->rt_ops != NULL && rt->rt_ops->rtop_add != NULL) rt->rt_ops->rtop_add(rt, newseg, rt->rt_arg); } else if (left_over) { rs->rs_end = start; @@ -286,15 +441,53 @@ } if (rs != NULL) { + /* + * The fill of the leftover segment will always be equal to + * the size, since we do not support removing partial segments + * of range trees with gaps. + */ + rs->rs_fill = rs->rs_end - rs->rs_start; range_tree_stat_incr(rt, rs); - if (rt->rt_ops != NULL) + if (rt->rt_ops != NULL && rt->rt_ops->rtop_add != NULL) rt->rt_ops->rtop_add(rt, rs, rt->rt_arg); } rt->rt_space -= size; } +void +range_tree_remove(void *arg, uint64_t start, uint64_t size) +{ + range_tree_remove_impl(arg, start, size, B_FALSE); +} + +void +range_tree_remove_fill(range_tree_t *rt, uint64_t start, uint64_t size) +{ + range_tree_remove_impl(rt, start, size, B_TRUE); +} + +void +range_tree_resize_segment(range_tree_t *rt, range_seg_t *rs, + uint64_t newstart, uint64_t newsize) +{ + int64_t delta = newsize - (rs->rs_end - rs->rs_start); + + range_tree_stat_decr(rt, rs); + if (rt->rt_ops != NULL && rt->rt_ops->rtop_remove != NULL) + rt->rt_ops->rtop_remove(rt, rs, rt->rt_arg); + + rs->rs_start = newstart; + rs->rs_end = newstart + newsize; + + range_tree_stat_incr(rt, rs); + if (rt->rt_ops != NULL && rt->rt_ops->rtop_add != NULL) + rt->rt_ops->rtop_add(rt, rs, rt->rt_arg); + + rt->rt_space += delta; +} + static range_seg_t * range_tree_find_impl(range_tree_t *rt, uint64_t start, uint64_t size) { @@ -309,7 +502,7 @@ return (avl_find(&rt->rt_root, &rsearch, &where)); } -static range_seg_t * +range_seg_t * range_tree_find(range_tree_t *rt, uint64_t start, uint64_t size) { range_seg_t *rs = range_tree_find_impl(rt, start, size); @@ -373,7 +566,7 @@ void *cookie = NULL; - if (rt->rt_ops != NULL) + if (rt->rt_ops != NULL && rt->rt_ops->rtop_vacate != NULL) rt->rt_ops->rtop_vacate(rt, rt->rt_arg); while ((rs = avl_destroy_nodes(&rt->rt_root, &cookie)) != NULL) { @@ -395,10 +588,61 @@ func(arg, rs->rs_start, 
rs->rs_end - rs->rs_start); } +range_seg_t * +range_tree_first(range_tree_t *rt) +{ + return (avl_first(&rt->rt_root)); +} + uint64_t range_tree_space(range_tree_t *rt) { return (rt->rt_space); +} + +/* Generic range tree functions for maintaining segments in an AVL tree. */ +void +rt_avl_create(range_tree_t *rt, void *arg) +{ + avl_tree_t *tree = arg; + + avl_create(tree, rt->rt_avl_compare, sizeof (range_seg_t), + offsetof(range_seg_t, rs_pp_node)); +} + +void +rt_avl_destroy(range_tree_t *rt, void *arg) +{ + avl_tree_t *tree = arg; + + ASSERT0(avl_numnodes(tree)); + avl_destroy(tree); +} + +void +rt_avl_add(range_tree_t *rt, range_seg_t *rs, void *arg) +{ + avl_tree_t *tree = arg; + avl_add(tree, rs); +} + +void +rt_avl_remove(range_tree_t *rt, range_seg_t *rs, void *arg) +{ + avl_tree_t *tree = arg; + avl_remove(tree, rs); +} + +void +rt_avl_vacate(range_tree_t *rt, void *arg) +{ + /* + * Normally one would walk the tree freeing nodes along the way. + * Since the nodes are shared with the range trees we can avoid + * walking all nodes and just reinitialize the avl tree. The nodes + * will be freed by the range tree, so we don't want to free them here. 
+ */ + rt_avl_create(rt, arg); } boolean_t Index: sys/cddl/contrib/opensolaris/uts/common/fs/zfs/spa.c =================================================================== --- sys/cddl/contrib/opensolaris/uts/common/fs/zfs/spa.c +++ sys/cddl/contrib/opensolaris/uts/common/fs/zfs/spa.c @@ -2035,7 +2035,7 @@ } mutex_enter(&spa->spa_scrub_lock); - spa->spa_scrub_inflight--; + spa->spa_load_verify_ios--; cv_broadcast(&spa->spa_scrub_io_cv); mutex_exit(&spa->spa_scrub_lock); } @@ -2082,9 +2082,9 @@ size_t size = BP_GET_PSIZE(bp); mutex_enter(&spa->spa_scrub_lock); - while (spa->spa_scrub_inflight >= spa_load_verify_maxinflight) + while (spa->spa_load_verify_ios >= spa_load_verify_maxinflight) cv_wait(&spa->spa_scrub_io_cv, &spa->spa_scrub_lock); - spa->spa_scrub_inflight++; + spa->spa_load_verify_ios++; mutex_exit(&spa->spa_scrub_lock); zio_nowait(zio_read(rio, spa, bp, abd_alloc_for_io(size, B_FALSE), size, Index: sys/cddl/contrib/opensolaris/uts/common/fs/zfs/spa_misc.c =================================================================== --- sys/cddl/contrib/opensolaris/uts/common/fs/zfs/spa_misc.c +++ sys/cddl/contrib/opensolaris/uts/common/fs/zfs/spa_misc.c @@ -2095,6 +2095,8 @@ zpool_feature_init(); spa_config_load(); l2arc_start(); + scan_init(); + dsl_scan_global_init(); #ifndef illumos #ifdef _KERNEL zfs_deadman_init(); @@ -2119,7 +2121,8 @@ range_tree_fini(); unique_fini(); refcount_fini(); - + scan_fini(); + avl_destroy(&spa_namespace_avl); avl_destroy(&spa_spare_avl); avl_destroy(&spa_l2cache_avl); @@ -2220,6 +2223,7 @@ spa->spa_scan_pass_scrub_pause = 0; spa->spa_scan_pass_scrub_spent_paused = 0; spa->spa_scan_pass_exam = 0; + spa->spa_scan_pass_issued = 0; vdev_scan_stat_init(spa->spa_root_vdev); } @@ -2237,18 +2241,20 @@ /* data stored on disk */ ps->pss_func = scn->scn_phys.scn_func; + ps->pss_state = scn->scn_phys.scn_state; ps->pss_start_time = scn->scn_phys.scn_start_time; ps->pss_end_time = scn->scn_phys.scn_end_time; ps->pss_to_examine = 
scn->scn_phys.scn_to_examine; - ps->pss_examined = scn->scn_phys.scn_examined; ps->pss_to_process = scn->scn_phys.scn_to_process; ps->pss_processed = scn->scn_phys.scn_processed; ps->pss_errors = scn->scn_phys.scn_errors; - ps->pss_state = scn->scn_phys.scn_state; - + ps->pss_examined = scn->scn_phys.scn_examined; + ps->pss_issued = + scn->scn_issued_before_pass + spa->spa_scan_pass_issued; /* data not stored on disk */ ps->pss_pass_start = spa->spa_scan_pass_start; ps->pss_pass_exam = spa->spa_scan_pass_exam; + ps->pss_pass_issued = spa->spa_scan_pass_issued; ps->pss_pass_scrub_pause = spa->spa_scan_pass_scrub_pause; ps->pss_pass_scrub_spent_paused = spa->spa_scan_pass_scrub_spent_paused; Index: sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/arc.h =================================================================== --- sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/arc.h +++ sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/arc.h @@ -58,11 +58,13 @@ typedef struct arc_buf_hdr arc_buf_hdr_t; typedef struct arc_buf arc_buf_t; -typedef void arc_done_func_t(zio_t *zio, arc_buf_t *buf, void *priv); +typedef void arc_read_done_func_t(zio_t *zio, const zbookmark_phys_t *zb, + const blkptr_t *bp, arc_buf_t *buf, void *priv); +typedef void arc_write_done_func_t(zio_t *zio, arc_buf_t *buf, void *priv); /* generic arc_done_func_t's which you can use */ -arc_done_func_t arc_bcopy_func; -arc_done_func_t arc_getbuf_func; +arc_read_done_func_t arc_bcopy_func; +arc_read_done_func_t arc_getbuf_func; typedef enum arc_flags { @@ -75,35 +77,36 @@ ARC_FLAG_CACHED = 1 << 3, /* I/O was in cache */ ARC_FLAG_L2CACHE = 1 << 4, /* cache in L2ARC */ ARC_FLAG_PREDICTIVE_PREFETCH = 1 << 5, /* I/O from zfetch */ + ARC_FLAG_PRESCIENT_PREFETCH = 1 << 6, /* long min lifespan */ /* * Private ARC flags. These flags are private ARC only flags that * will show up in b_flags in the arc_hdr_buf_t. These flags should * only be set by ARC code. 
*/ - ARC_FLAG_IN_HASH_TABLE = 1 << 6, /* buffer is hashed */ - ARC_FLAG_IO_IN_PROGRESS = 1 << 7, /* I/O in progress */ - ARC_FLAG_IO_ERROR = 1 << 8, /* I/O failed for buf */ - ARC_FLAG_INDIRECT = 1 << 9, /* indirect block */ + ARC_FLAG_IN_HASH_TABLE = 1 << 7, /* buffer is hashed */ + ARC_FLAG_IO_IN_PROGRESS = 1 << 8, /* I/O in progress */ + ARC_FLAG_IO_ERROR = 1 << 9, /* I/O failed for buf */ + ARC_FLAG_INDIRECT = 1 << 10, /* indirect block */ /* Indicates that block was read with ASYNC priority. */ - ARC_FLAG_PRIO_ASYNC_READ = 1 << 10, - ARC_FLAG_L2_WRITING = 1 << 11, /* write in progress */ - ARC_FLAG_L2_EVICTED = 1 << 12, /* evicted during I/O */ - ARC_FLAG_L2_WRITE_HEAD = 1 << 13, /* head of write list */ + ARC_FLAG_PRIO_ASYNC_READ = 1 << 11, + ARC_FLAG_L2_WRITING = 1 << 12, /* write in progress */ + ARC_FLAG_L2_EVICTED = 1 << 13, /* evicted during I/O */ + ARC_FLAG_L2_WRITE_HEAD = 1 << 14, /* head of write list */ /* indicates that the buffer contains metadata (otherwise, data) */ - ARC_FLAG_BUFC_METADATA = 1 << 14, + ARC_FLAG_BUFC_METADATA = 1 << 15, /* Flags specifying whether optional hdr struct fields are defined */ - ARC_FLAG_HAS_L1HDR = 1 << 15, - ARC_FLAG_HAS_L2HDR = 1 << 16, + ARC_FLAG_HAS_L1HDR = 1 << 16, + ARC_FLAG_HAS_L2HDR = 1 << 17, /* * Indicates the arc_buf_hdr_t's b_pdata matches the on-disk data. * This allows the l2arc to use the blkptr's checksum to verify * the data without having to store the checksum in the hdr. 
*/ - ARC_FLAG_COMPRESSED_ARC = 1 << 17, - ARC_FLAG_SHARED_DATA = 1 << 18, + ARC_FLAG_COMPRESSED_ARC = 1 << 18, + ARC_FLAG_SHARED_DATA = 1 << 19, /* * The arc buffer's compression mode is stored in the top 7 bits of the @@ -179,12 +182,12 @@ #endif int arc_read(zio_t *pio, spa_t *spa, const blkptr_t *bp, - arc_done_func_t *done, void *priv, zio_priority_t priority, int flags, - arc_flags_t *arc_flags, const zbookmark_phys_t *zb); + arc_read_done_func_t *done, void *priv, zio_priority_t priority, + int flags, arc_flags_t *arc_flags, const zbookmark_phys_t *zb); zio_t *arc_write(zio_t *pio, spa_t *spa, uint64_t txg, blkptr_t *bp, arc_buf_t *buf, boolean_t l2arc, const zio_prop_t *zp, - arc_done_func_t *ready, arc_done_func_t *child_ready, - arc_done_func_t *physdone, arc_done_func_t *done, + arc_write_done_func_t *ready, arc_write_done_func_t *child_ready, + arc_write_done_func_t *physdone, arc_write_done_func_t *done, void *priv, zio_priority_t priority, int zio_flags, const zbookmark_phys_t *zb); void arc_freed(spa_t *spa, const blkptr_t *bp); Index: sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/dsl_pool.h =================================================================== --- sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/dsl_pool.h +++ sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/dsl_pool.h @@ -76,6 +76,7 @@ typedef struct zfs_all_blkstats { zfs_blkstat_t zab_type[DN_MAX_LEVELS + 1][DMU_OT_TOTAL + 1]; + kmutex_t zab_lock; } zfs_all_blkstats_t; Index: sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/dsl_scan.h =================================================================== --- sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/dsl_scan.h +++ sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/dsl_scan.h @@ -107,24 +107,58 @@ typedef struct dsl_scan { struct dsl_pool *scn_dp; - boolean_t scn_suspending; uint64_t scn_restart_txg; uint64_t scn_done_txg; uint64_t scn_sync_start_time; - zio_t *scn_zio_root; + uint64_t scn_issued_before_pass; /* for 
freeing blocks */ boolean_t scn_is_bptree; boolean_t scn_async_destroying; boolean_t scn_async_stalled; uint64_t scn_async_block_min_time_ms; + /* flags and stats for controlling scan state */ + boolean_t scn_is_sorted; /* doing sequential scan */ + boolean_t scn_clearing; /* scan is issuing sequential extents */ + boolean_t scn_checkpointing; /* scan is issuing all queued extents */ + boolean_t scn_suspending; /* scan is suspending until next txg */ + uint64_t scn_last_checkpoint; /* time of last checkpoint */ - /* for debugging / information */ - uint64_t scn_visited_this_txg; + /* members for thread synchronization */ + zio_t *scn_zio_root; /* root zio for waiting on IO */ + taskq_t *scn_taskq; /* task queue for issuing extents */ - dsl_scan_phys_t scn_phys; + /* for controlling scan prefetch, protected by spa_scrub_lock */ + boolean_t scn_prefetch_stop; /* prefetch should stop */ + zbookmark_phys_t scn_prefetch_bookmark; /* prefetch start bookmark */ + avl_tree_t scn_prefetch_queue; /* priority queue of prefetch IOs */ + uint64_t scn_maxinflight_bytes; /* max bytes in flight for pool */ + + /* per txg statistics */ + uint64_t scn_visited_this_txg; /* total bps visited this txg */ + uint64_t scn_holes_this_txg; + uint64_t scn_lt_min_this_txg; + uint64_t scn_gt_max_this_txg; + uint64_t scn_ddt_contained_this_txg; + uint64_t scn_objsets_visited_this_txg; + uint64_t scn_avg_seg_size_this_txg; + uint64_t scn_segs_this_txg; + uint64_t scn_avg_zio_size_this_txg; + uint64_t scn_zios_this_txg; + + /* members needed for syncing scan status to disk */ + dsl_scan_phys_t scn_phys; /* on disk representation of scan */ + dsl_scan_phys_t scn_phys_cached; + avl_tree_t scn_queue; /* queue of datasets to scan */ + uint64_t scn_bytes_pending; /* outstanding data to issue */ } dsl_scan_t; +typedef struct dsl_scan_io_queue dsl_scan_io_queue_t; + +void dsl_scan_global_init(void); + +void scan_init(void); +void scan_fini(void); int dsl_scan_init(struct dsl_pool *dp, uint64_t txg);
void dsl_scan_fini(struct dsl_pool *dp); void dsl_scan_sync(struct dsl_pool *, dmu_tx_t *); @@ -143,6 +177,9 @@ struct dmu_tx *tx); boolean_t dsl_scan_active(dsl_scan_t *scn); boolean_t dsl_scan_is_paused_scrub(const dsl_scan_t *scn); +void dsl_scan_freed(spa_t *spa, const blkptr_t *bp); +void dsl_scan_io_queue_destroy(dsl_scan_io_queue_t *queue); +void dsl_scan_io_queue_vdev_xfer(vdev_t *svd, vdev_t *tvd); #ifdef __cplusplus } Index: sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/range_tree.h =================================================================== --- sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/range_tree.h +++ sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/range_tree.h @@ -51,6 +51,9 @@ range_tree_ops_t *rt_ops; void *rt_arg; + /* rt_avl_compare should only be set if rt_arg is an AVL tree */ + uint64_t rt_gap; /* allowable inter-segment gap */ + int (*rt_avl_compare)(const void *, const void *); /* * The rt_histogram maintains a histogram of ranges. Each bucket, * rt_histogram[i], contains the number of ranges whose size is: @@ -64,6 +67,7 @@ avl_node_t rs_pp_node; /* AVL picker-private node */ uint64_t rs_start; /* starting offset of this segment */ uint64_t rs_end; /* ending offset (non-inclusive) */ + uint64_t rs_fill; /* actual fill if gap mode is on */ } range_seg_t; struct range_tree_ops { @@ -78,9 +82,14 @@ void range_tree_init(void); void range_tree_fini(void); -range_tree_t *range_tree_create(range_tree_ops_t *ops, void *arg); +range_tree_t *range_tree_create_impl(range_tree_ops_t *ops, void *arg, + int (*avl_compare)(const void*, const void*), uint64_t gap); + range_tree_t *range_tree_create(range_tree_ops_t *ops, void *arg); void range_tree_destroy(range_tree_t *rt); boolean_t range_tree_contains(range_tree_t *rt, uint64_t start, uint64_t size); +range_seg_t *range_tree_find(range_tree_t *rt, uint64_t start, uint64_t size); +void range_tree_resize_segment(range_tree_t *rt, range_seg_t *rs, + uint64_t newstart, uint64_t 
newsize); uint64_t range_tree_space(range_tree_t *rt); boolean_t range_tree_is_empty(range_tree_t *rt); void range_tree_verify(range_tree_t *rt, uint64_t start, uint64_t size); @@ -89,10 +98,20 @@ void range_tree_add(void *arg, uint64_t start, uint64_t size); void range_tree_remove(void *arg, uint64_t start, uint64_t size); +void range_tree_remove_fill(range_tree_t *rt, uint64_t start, uint64_t size); +void range_tree_adjust_fill(range_tree_t *rt, range_seg_t *rs, int64_t delta); void range_tree_clear(range_tree_t *rt, uint64_t start, uint64_t size); void range_tree_vacate(range_tree_t *rt, range_tree_func_t *func, void *arg); void range_tree_walk(range_tree_t *rt, range_tree_func_t *func, void *arg); +range_seg_t *range_tree_first(range_tree_t *rt); + +void rt_avl_create(range_tree_t *rt, void *arg); +void rt_avl_destroy(range_tree_t *rt, void *arg); +void rt_avl_add(range_tree_t *rt, range_seg_t *rs, void *arg); +void rt_avl_remove(range_tree_t *rt, range_seg_t *rs, void *arg); +void rt_avl_vacate(range_tree_t *rt, void *arg); +extern struct range_tree_ops rt_avl_ops; #ifdef __cplusplus } Index: sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/spa_impl.h =================================================================== --- sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/spa_impl.h +++ sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/spa_impl.h @@ -257,7 +257,8 @@ boolean_t spa_extreme_rewind; /* rewind past deferred frees */ uint64_t spa_last_io; /* lbolt of last non-scan I/O */ kmutex_t spa_scrub_lock; /* resilver/scrub lock */ - uint64_t spa_scrub_inflight; /* in-flight scrub I/Os */ + uint64_t spa_scrub_inflight; /* in-flight scrub 
bytes */ + uint64_t spa_load_verify_ios; /* in-flight verification IOs */ kcondvar_t spa_scrub_io_cv; /* scrub I/O completion */ uint8_t spa_scrub_active; /* active or suspended? */ uint8_t spa_scrub_type; /* type of scrub we're doing */ @@ -268,6 +269,7 @@ uint64_t spa_scan_pass_scrub_pause; /* scrub pause time */ uint64_t spa_scan_pass_scrub_spent_paused; /* total paused */ uint64_t spa_scan_pass_exam; /* examined bytes per pass */ + uint64_t spa_scan_pass_issued; /* issued bytes per pass */ kmutex_t spa_async_lock; /* protect async state */ kthread_t *spa_async_thread; /* thread doing async task */ kthread_t *spa_async_thread_vd; /* thread doing vd async task */ Index: sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/vdev.h =================================================================== --- sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/vdev.h +++ sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/vdev.h @@ -71,6 +71,7 @@ extern boolean_t vdev_dtl_contains(vdev_t *vd, vdev_dtl_type_t d, uint64_t txg, uint64_t size); extern boolean_t vdev_dtl_empty(vdev_t *vd, vdev_dtl_type_t d); +extern boolean_t vdev_dtl_need_resilver(vdev_t *vd, uint64_t off, size_t size); extern void vdev_dtl_reassess(vdev_t *vd, uint64_t txg, uint64_t scrub_txg, int scrub_done); extern boolean_t vdev_dtl_required(vdev_t *vd); @@ -135,6 +136,7 @@ extern void vdev_queue_fini(vdev_t *vd); extern zio_t *vdev_queue_io(zio_t *zio); extern void vdev_queue_io_done(zio_t *zio); +extern void vdev_queue_change_io_priority(zio_t *zio, zio_priority_t priority); extern int vdev_queue_length(vdev_t *vd); extern uint64_t vdev_queue_lastoffset(vdev_t *vd); extern void vdev_queue_register_lastoffset(vdev_t *vd, zio_t *zio); Index: sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/vdev_impl.h =================================================================== --- sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/vdev_impl.h +++ sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/vdev_impl.h @@ 
-71,6 +71,7 @@ typedef void vdev_io_start_func_t(zio_t *zio); typedef void vdev_io_done_func_t(zio_t *zio); typedef void vdev_state_change_func_t(vdev_t *vd, int, int); +typedef boolean_t vdev_need_resilver_func_t(vdev_t *vd, uint64_t, size_t); typedef void vdev_hold_func_t(vdev_t *vd); typedef void vdev_rele_func_t(vdev_t *vd); @@ -86,6 +87,7 @@ vdev_io_start_func_t *vdev_op_io_start; vdev_io_done_func_t *vdev_op_io_done; vdev_state_change_func_t *vdev_op_state_change; + vdev_need_resilver_func_t *vdev_op_need_resilver; vdev_hold_func_t *vdev_op_hold; vdev_rele_func_t *vdev_op_rele; vdev_remap_func_t *vdev_op_remap; @@ -293,6 +295,13 @@ */ uint64_t vdev_async_write_queue_depth; uint64_t vdev_max_async_write_queue_depth; + + /* + * Protects the vdev_scan_io_queue field itself as well as the + * structure's contents (when present). + */ + kmutex_t vdev_scan_io_queue_lock; + struct dsl_scan_io_queue *vdev_scan_io_queue; /* * Leaf vdev state. Index: sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zio.h =================================================================== --- sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zio.h +++ sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zio.h @@ -593,6 +593,8 @@ extern void zio_vdev_io_reissue(zio_t *zio); extern void zio_vdev_io_redone(zio_t *zio); +extern void zio_change_priority(zio_t *pio, zio_priority_t priority); + extern void zio_checksum_verified(zio_t *zio); extern int zio_worst_error(int e1, int e2); Index: sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev.c =================================================================== --- sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev.c +++ sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev.c @@ -559,6 +559,8 @@ mutex_init(&vd->vdev_stat_lock, NULL, MUTEX_DEFAULT, NULL); mutex_init(&vd->vdev_probe_lock, NULL, MUTEX_DEFAULT, NULL); mutex_init(&vd->vdev_queue_lock, NULL, MUTEX_DEFAULT, NULL); + mutex_init(&vd->vdev_scan_io_queue_lock, NULL, MUTEX_DEFAULT, NULL); 
+ for (int t = 0; t < DTL_TYPES; t++) { vd->vdev_dtl[t] = range_tree_create(NULL, NULL); } @@ -832,6 +834,18 @@ spa_t *spa = vd->vdev_spa; /* + * Scan queues are normally destroyed at the end of a scan. If the + * queue exists here, that implies the vdev is being removed while + * the scan is still running. + */ + if (vd->vdev_scan_io_queue != NULL) { + mutex_enter(&vd->vdev_scan_io_queue_lock); + dsl_scan_io_queue_destroy(vd->vdev_scan_io_queue); + vd->vdev_scan_io_queue = NULL; + mutex_exit(&vd->vdev_scan_io_queue_lock); + } + + /* * vdev_free() implies closing the vdev first. This is simpler than * trying to ensure complicated semantics for all callers. */ @@ -920,6 +934,7 @@ mutex_destroy(&vd->vdev_dtl_lock); mutex_destroy(&vd->vdev_stat_lock); mutex_destroy(&vd->vdev_probe_lock); + mutex_destroy(&vd->vdev_scan_io_queue_lock); if (vd == spa->spa_root_vdev) spa->spa_root_vdev = NULL; @@ -996,6 +1011,8 @@ tvd->vdev_islog = svd->vdev_islog; svd->vdev_islog = 0; + + dsl_scan_io_queue_vdev_xfer(svd, tvd); } static void @@ -2286,6 +2303,21 @@ mutex_exit(&vd->vdev_dtl_lock); return (empty); +} + +/* + * Returns B_TRUE if vdev determines offset needs to be resilvered. 
+ */ +boolean_t +vdev_dtl_need_resilver(vdev_t *vd, uint64_t offset, size_t psize) +{ + ASSERT(vd != vd->vdev_spa->spa_root_vdev); + + if (vd->vdev_ops->vdev_op_need_resilver == NULL || + vd->vdev_ops->vdev_op_leaf) + return (B_TRUE); + + return (vd->vdev_ops->vdev_op_need_resilver(vd, offset, psize)); } /* Index: sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev_disk.c =================================================================== --- sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev_disk.c +++ sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev_disk.c @@ -837,6 +837,7 @@ vdev_disk_io_start, vdev_disk_io_done, NULL, + NULL, vdev_disk_hold, vdev_disk_rele, NULL, Index: sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev_file.c =================================================================== --- sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev_file.c +++ sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev_file.c @@ -267,6 +267,7 @@ vdev_file_io_start, vdev_file_io_done, NULL, + NULL, vdev_file_hold, vdev_file_rele, NULL, @@ -285,6 +286,7 @@ vdev_default_asize, vdev_file_io_start, vdev_file_io_done, + NULL, NULL, vdev_file_hold, vdev_file_rele, Index: sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev_geom.c =================================================================== --- sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev_geom.c +++ sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev_geom.c @@ -1147,6 +1147,7 @@ vdev_geom_io_start, vdev_geom_io_done, NULL, + NULL, vdev_geom_hold, vdev_geom_rele, NULL, Index: sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev_indirect.c =================================================================== --- sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev_indirect.c +++ sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev_indirect.c @@ -1111,6 +1111,7 @@ NULL, NULL, NULL, + NULL, vdev_indirect_remap, VDEV_TYPE_INDIRECT, /* name of this vdev type */ B_FALSE /* leaf vdev */ Index: 
sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev_mirror.c =================================================================== --- sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev_mirror.c +++ sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev_mirror.c @@ -722,6 +722,7 @@ NULL, NULL, NULL, + NULL, VDEV_TYPE_MIRROR, /* name of this vdev type */ B_FALSE /* not a leaf vdev */ }; @@ -736,6 +737,7 @@ NULL, NULL, NULL, + NULL, VDEV_TYPE_REPLACING, /* name of this vdev type */ B_FALSE /* not a leaf vdev */ }; @@ -747,6 +749,7 @@ vdev_mirror_io_start, vdev_mirror_io_done, vdev_mirror_state_change, + NULL, NULL, NULL, NULL, Index: sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev_missing.c =================================================================== --- sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev_missing.c +++ sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev_missing.c @@ -90,6 +90,7 @@ NULL, NULL, NULL, + NULL, VDEV_TYPE_MISSING, /* name of this vdev type */ B_TRUE /* leaf vdev */ }; @@ -100,6 +101,7 @@ vdev_default_asize, vdev_missing_io_start, vdev_missing_io_done, + NULL, NULL, NULL, NULL, Index: sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev_queue.c =================================================================== --- sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev_queue.c +++ sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev_queue.c @@ -175,7 +175,7 @@ * we include spans of optional I/Os to aid aggregation at the disk even when * they aren't able to help us aggregate at this level. 
*/ -int zfs_vdev_aggregation_limit = SPA_OLD_MAXBLOCKSIZE; +int zfs_vdev_aggregation_limit = 1 << 20; int zfs_vdev_read_gap_limit = 32 << 10; int zfs_vdev_write_gap_limit = 4 << 10; @@ -933,6 +933,48 @@ zio_execute(nio); } mutex_enter(&vq->vq_lock); + } + + mutex_exit(&vq->vq_lock); +} + +void +vdev_queue_change_io_priority(zio_t *zio, zio_priority_t priority) +{ + vdev_queue_t *vq = &zio->io_vd->vdev_queue; + avl_tree_t *tree; + + ASSERT3U(zio->io_priority, <, ZIO_PRIORITY_NUM_QUEUEABLE); + ASSERT3U(priority, <, ZIO_PRIORITY_NUM_QUEUEABLE); + + if (zio->io_type == ZIO_TYPE_READ) { + if (priority != ZIO_PRIORITY_SYNC_READ && + priority != ZIO_PRIORITY_ASYNC_READ && + priority != ZIO_PRIORITY_SCRUB) + priority = ZIO_PRIORITY_ASYNC_READ; + } else { + ASSERT(zio->io_type == ZIO_TYPE_WRITE); + if (priority != ZIO_PRIORITY_SYNC_WRITE && + priority != ZIO_PRIORITY_ASYNC_WRITE) + priority = ZIO_PRIORITY_ASYNC_WRITE; + } + + mutex_enter(&vq->vq_lock); + + /* + * If the zio is in none of the queues we can simply change + * the priority. If the zio is waiting to be submitted we must + * remove it from the queue and re-insert it with the new priority. + * Otherwise, the zio is currently active and we cannot change its + * priority. 
+ */ + tree = vdev_queue_class_tree(vq, zio->io_priority); + if (avl_find(tree, zio, NULL) == zio) { + avl_remove(vdev_queue_class_tree(vq, zio->io_priority), zio); + zio->io_priority = priority; + avl_add(vdev_queue_class_tree(vq, zio->io_priority), zio); + } else if (avl_find(&vq->vq_active_tree, zio, NULL) != zio) { + zio->io_priority = priority; } mutex_exit(&vq->vq_lock); Index: sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev_raidz.c =================================================================== --- sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev_raidz.c +++ sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev_raidz.c @@ -2584,6 +2584,44 @@ vdev_set_state(vd, B_FALSE, VDEV_STATE_HEALTHY, VDEV_AUX_NONE); } +/* + * Determine if any portion of the provided block resides on a child vdev + * with a dirty DTL and therefore needs to be resilvered. The function + * assumes that at least one DTL is dirty which implies that full stripe + * width blocks must be resilvered. + */ +static boolean_t +vdev_raidz_need_resilver(vdev_t *vd, uint64_t offset, size_t psize) +{ + uint64_t dcols = vd->vdev_children; + uint64_t nparity = vd->vdev_nparity; + uint64_t ashift = vd->vdev_top->vdev_ashift; + /* The starting RAIDZ (parent) vdev sector of the block. */ + uint64_t b = offset >> ashift; + /* The zio's size in units of the vdev's minimum sector size. */ + uint64_t s = ((psize - 1) >> ashift) + 1; + /* The first column for this stripe. */ + uint64_t f = b % dcols; + + if (s + nparity >= dcols) + return (B_TRUE); + + for (uint64_t c = 0; c < s + nparity; c++) { + uint64_t devidx = (f + c) % dcols; + vdev_t *cvd = vd->vdev_child[devidx]; + + /* + * dsl_scan_need_resilver() already checked vd with + * vdev_dtl_contains(). So here just check cvd with + * vdev_dtl_empty(), cheaper and a good approximation. 
+ */ + if (!vdev_dtl_empty(cvd, DTL_PARTIAL)) + return (B_TRUE); + } + + return (B_FALSE); +} + vdev_ops_t vdev_raidz_ops = { vdev_raidz_open, vdev_raidz_close, @@ -2591,6 +2629,7 @@ vdev_raidz_io_start, vdev_raidz_io_done, vdev_raidz_state_change, + vdev_raidz_need_resilver, NULL, NULL, NULL, Index: sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev_root.c =================================================================== --- sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev_root.c +++ sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev_root.c @@ -150,6 +150,7 @@ NULL, NULL, NULL, + NULL, VDEV_TYPE_ROOT, /* name of this vdev type */ B_FALSE /* not a leaf vdev */ }; Index: sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zap.c =================================================================== --- sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zap.c +++ sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zap.c @@ -1051,7 +1051,7 @@ } err = zap_add(os, intoobj, za.za_name, 8, 1, &value, tx); - if (err) + if (err != 0) break; } zap_cursor_fini(&zc); Index: sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zio.c =================================================================== --- sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zio.c +++ sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zio.c @@ -41,6 +41,7 @@ #include #include #include +#include #include #include @@ -438,6 +439,8 @@ { list_t *cl = &pio->io_child_list; + ASSERT(MUTEX_HELD(&pio->io_lock)); + *zl = (*zl == NULL) ? 
list_head(cl) : list_next(cl, *zl); if (*zl == NULL) return (NULL); @@ -472,8 +475,8 @@ zl->zl_parent = pio; zl->zl_child = cio; - mutex_enter(&cio->io_lock); mutex_enter(&pio->io_lock); + mutex_enter(&cio->io_lock); ASSERT(pio->io_state[ZIO_WAIT_DONE] == 0); @@ -486,8 +489,8 @@ pio->io_child_count++; cio->io_parent_count++; - mutex_exit(&pio->io_lock); mutex_exit(&cio->io_lock); + mutex_exit(&pio->io_lock); } static void @@ -496,8 +499,8 @@ ASSERT(zl->zl_parent == pio); ASSERT(zl->zl_child == cio); - mutex_enter(&cio->io_lock); mutex_enter(&pio->io_lock); + mutex_enter(&cio->io_lock); list_remove(&pio->io_child_list, zl); list_remove(&cio->io_parent_list, zl); @@ -505,9 +508,8 @@ pio->io_child_count--; cio->io_parent_count--; - mutex_exit(&pio->io_lock); mutex_exit(&cio->io_lock); - + mutex_exit(&pio->io_lock); kmem_cache_free(zio_link_cache, zl); } @@ -988,6 +990,7 @@ metaslab_check_free(spa, bp); arc_freed(spa, bp); + dsl_scan_freed(spa, bp); if (zfs_trim_enabled) stage |= ZIO_STAGE_ISSUE_ASYNC | ZIO_STAGE_VDEV_IO_START | @@ -1865,14 +1868,16 @@ * cannot be affected by any side effects of reexecuting 'cio'. */ zio_link_t *zl = NULL; + mutex_enter(&pio->io_lock); for (cio = zio_walk_children(pio, &zl); cio != NULL; cio = cio_next) { cio_next = zio_walk_children(pio, &zl); - mutex_enter(&pio->io_lock); for (int w = 0; w < ZIO_WAIT_TYPES; w++) pio->io_children[cio->io_child_type][w]++; mutex_exit(&pio->io_lock); zio_reexecute(cio); + mutex_enter(&pio->io_lock); } + mutex_exit(&pio->io_lock); /* * Now that all children have been reexecuted, execute the parent. @@ -3184,26 +3189,25 @@ } } - /* - * We keep track of time-sensitive I/Os so that the scan thread - * can quickly react to certain workloads. 
In particular, we care - * about non-scrubbing, top-level reads and writes with the following - * characteristics: - * - synchronous writes of user data to non-slog devices - * - any reads of user data - * When these conditions are met, adjust the timestamp of spa_last_io - * which allows the scan thread to adjust its workload accordingly. - */ - if (!(zio->io_flags & ZIO_FLAG_SCAN_THREAD) && zio->io_bp != NULL && - vd == vd->vdev_top && !vd->vdev_islog && - zio->io_bookmark.zb_objset != DMU_META_OBJSET && - zio->io_txg != spa_syncing_txg(spa)) { - uint64_t old = spa->spa_last_io; - uint64_t new = ddi_get_lbolt64(); - if (old != new) - (void) atomic_cas_64(&spa->spa_last_io, old, new); - } - + /* + * We keep track of time-sensitive I/Os so that the scan thread + * can quickly react to certain workloads. In particular, we care + * about non-scrubbing, top-level reads and writes with the following + * characteristics: + * - synchronous writes of user data to non-slog devices + * - any reads of user data + * When these conditions are met, adjust the timestamp of spa_last_io + * which allows the scan thread to adjust its workload accordingly. + */ + if (!(zio->io_flags & ZIO_FLAG_SCAN_THREAD) && zio->io_bp != NULL && + vd == vd->vdev_top && !vd->vdev_islog && + zio->io_bookmark.zb_objset != DMU_META_OBJSET && + zio->io_txg != spa_syncing_txg(spa)) { + uint64_t old = spa->spa_last_io; + uint64_t new = ddi_get_lbolt64(); + if (old != new) + (void) atomic_cas_64(&spa->spa_last_io, old, new); + } align = 1ULL << vd->vdev_top->vdev_ashift; if (!(zio->io_flags & ZIO_FLAG_PHYSICAL) && @@ -3350,6 +3354,35 @@ VERIFY(vdev_probe(vd, zio) == NULL); return (ZIO_PIPELINE_CONTINUE); +} + +/* + * This function is used to change the priority of an existing zio that is + * currently in-flight. This is used by the arc to upgrade priority in the + * event that a demand read is made for a block that is currently queued + * as a scrub or async read IO. 
Otherwise, the high priority read request + * would end up having to wait for the lower priority IO. + */ +void +zio_change_priority(zio_t *pio, zio_priority_t priority) +{ + zio_t *cio, *cio_next; + zio_link_t *zl = NULL; + + ASSERT3U(priority, <, ZIO_PRIORITY_NUM_QUEUEABLE); + + if (pio->io_vd != NULL && pio->io_vd->vdev_ops->vdev_op_leaf) { + vdev_queue_change_io_priority(pio, priority); + } else { + pio->io_priority = priority; + } + + mutex_enter(&pio->io_lock); + for (cio = zio_walk_children(pio, &zl); cio != NULL; cio = cio_next) { + cio_next = zio_walk_children(pio, &zl); + zio_change_priority(cio, priority); + } + mutex_exit(&pio->io_lock); } /* Index: sys/cddl/contrib/opensolaris/uts/common/sys/fs/zfs.h =================================================================== --- sys/cddl/contrib/opensolaris/uts/common/sys/fs/zfs.h +++ sys/cddl/contrib/opensolaris/uts/common/sys/fs/zfs.h @@ -760,7 +760,7 @@ uint64_t pss_start_time; /* scan start time */ uint64_t pss_end_time; /* scan end time */ uint64_t pss_to_examine; /* total bytes to scan */ - uint64_t pss_examined; /* total examined bytes */ + uint64_t pss_examined; /* total bytes located by scanner */ uint64_t pss_to_process; /* total bytes to process */ uint64_t pss_processed; /* total processed bytes */ uint64_t pss_errors; /* scan errors */ @@ -771,6 +771,12 @@ uint64_t pss_pass_scrub_pause; /* pause time of a scurb pass */ /* cumulative time scrub spent paused, needed for rate calculation */ uint64_t pss_pass_scrub_spent_paused; + + /* Sorted scrubbing new fields */ + /* Stored on disk */ + uint64_t pss_issued; /* total bytes checked by scanner */ + /* Not stored on disk */ + uint64_t pss_pass_issued; /* issued bytes per scan pass */ } pool_scan_stat_t; typedef struct pool_removal_stat { Index: sys/cddl/contrib/opensolaris/uts/common/sys/taskq.h =================================================================== --- sys/cddl/contrib/opensolaris/uts/common/sys/taskq.h +++ 
sys/cddl/contrib/opensolaris/uts/common/sys/taskq.h @@ -72,6 +72,8 @@ #define TQ_NOALLOC 0x04 /* cannot allocate memory; may fail */ #define TQ_FRONT 0x08 /* Put task at the front of the queue */ +#define TASKQID_INVALID ((taskqid_t)0) + #ifdef _KERNEL extern taskq_t *system_taskq; @@ -91,6 +93,7 @@ void nulltask(void *); void taskq_destroy(taskq_t *); void taskq_wait(taskq_t *); +void taskq_wait_id(taskq_t *, taskqid_t); void taskq_suspend(taskq_t *); int taskq_suspended(taskq_t *); void taskq_resume(taskq_t *);