Page Menu
Home
FreeBSD
Search
Configure Global Search
Log In
Files
F132336679
D15562.id.diff
No One
Temporary
Actions
View File
Edit File
Delete File
View Transforms
Subscribe
Mute Notifications
Flag For Later
Award Token
Size
193 KB
Referenced Files
None
Subscribers
None
D15562.id.diff
View Options
Index: head/cddl/contrib/opensolaris/cmd/zdb/zdb.c
===================================================================
--- head/cddl/contrib/opensolaris/cmd/zdb/zdb.c
+++ head/cddl/contrib/opensolaris/cmd/zdb/zdb.c
@@ -2281,14 +2281,14 @@
object_count++;
}
- ASSERT3U(object_count, ==, usedobjs);
-
(void) printf("\n");
if (error != ESRCH) {
(void) fprintf(stderr, "dmu_object_next() = %d\n", error);
abort();
}
+
+ ASSERT3U(object_count, ==, usedobjs);
}
static void
@@ -2788,6 +2788,7 @@
mutex_enter(&spa->spa_scrub_lock);
spa->spa_scrub_inflight--;
+ spa->spa_load_verify_ios--;
cv_broadcast(&spa->spa_scrub_io_cv);
if (ioerr && !(zio->io_flags & ZIO_FLAG_SPECULATIVE)) {
@@ -2859,9 +2860,10 @@
flags |= ZIO_FLAG_SPECULATIVE;
mutex_enter(&spa->spa_scrub_lock);
- while (spa->spa_scrub_inflight > max_inflight)
+ while (spa->spa_load_verify_ios > max_inflight)
cv_wait(&spa->spa_scrub_io_cv, &spa->spa_scrub_lock);
spa->spa_scrub_inflight++;
+ spa->spa_load_verify_ios++;
mutex_exit(&spa->spa_scrub_lock);
zio_nowait(zio_read(NULL, spa, bp, abd, size,
Index: head/cddl/contrib/opensolaris/cmd/zpool/zpool_main.c
===================================================================
--- head/cddl/contrib/opensolaris/cmd/zpool/zpool_main.c
+++ head/cddl/contrib/opensolaris/cmd/zpool/zpool_main.c
@@ -1643,7 +1643,7 @@
(void) nvlist_lookup_uint64_array(nv, ZPOOL_CONFIG_SCAN_STATS,
(uint64_t **)&ps, &c);
- if (ps && ps->pss_state == DSS_SCANNING &&
+ if (ps != NULL && ps->pss_state == DSS_SCANNING &&
vs->vs_scan_processed != 0 && children == 0) {
(void) printf(gettext(" (%s)"),
(ps->pss_func == POOL_SCAN_RESILVER) ?
@@ -4254,11 +4254,13 @@
print_scan_status(pool_scan_stat_t *ps)
{
time_t start, end, pause;
- uint64_t elapsed, mins_left, hours_left;
- uint64_t pass_exam, examined, total;
- uint_t rate;
+ uint64_t total_secs_left;
+ uint64_t elapsed, secs_left, mins_left, hours_left, days_left;
+ uint64_t pass_scanned, scanned, pass_issued, issued, total;
+ uint_t scan_rate, issue_rate;
double fraction_done;
- char processed_buf[7], examined_buf[7], total_buf[7], rate_buf[7];
+ char processed_buf[7], scanned_buf[7], issued_buf[7], total_buf[7];
+ char srate_buf[7], irate_buf[7];
(void) printf(gettext(" scan: "));
@@ -4272,30 +4274,37 @@
start = ps->pss_start_time;
end = ps->pss_end_time;
pause = ps->pss_pass_scrub_pause;
+
zfs_nicenum(ps->pss_processed, processed_buf, sizeof (processed_buf));
assert(ps->pss_func == POOL_SCAN_SCRUB ||
ps->pss_func == POOL_SCAN_RESILVER);
- /*
- * Scan is finished or canceled.
- */
- if (ps->pss_state == DSS_FINISHED) {
- uint64_t minutes_taken = (end - start) / 60;
- char *fmt = NULL;
+ /* Scan is finished or canceled. */
+ if (ps->pss_state == DSS_FINISHED) {
+ total_secs_left = end - start;
+ days_left = total_secs_left / 60 / 60 / 24;
+ hours_left = (total_secs_left / 60 / 60) % 24;
+ mins_left = (total_secs_left / 60) % 60;
+ secs_left = (total_secs_left % 60);
+
if (ps->pss_func == POOL_SCAN_SCRUB) {
- fmt = gettext("scrub repaired %s in %lluh%um with "
- "%llu errors on %s");
+ (void) printf(gettext("scrub repaired %s "
+ "in %llu days %02llu:%02llu:%02llu "
+ "with %llu errors on %s"), processed_buf,
+ (u_longlong_t)days_left, (u_longlong_t)hours_left,
+ (u_longlong_t)mins_left, (u_longlong_t)secs_left,
+ (u_longlong_t)ps->pss_errors, ctime(&end));
} else if (ps->pss_func == POOL_SCAN_RESILVER) {
- fmt = gettext("resilvered %s in %lluh%um with "
- "%llu errors on %s");
+ (void) printf(gettext("resilvered %s "
+ "in %llu days %02llu:%02llu:%02llu "
+ "with %llu errors on %s"), processed_buf,
+ (u_longlong_t)days_left, (u_longlong_t)hours_left,
+ (u_longlong_t)mins_left, (u_longlong_t)secs_left,
+ (u_longlong_t)ps->pss_errors, ctime(&end));
+
}
- /* LINTED */
- (void) printf(fmt, processed_buf,
- (u_longlong_t)(minutes_taken / 60),
- (uint_t)(minutes_taken % 60),
- (u_longlong_t)ps->pss_errors,
- ctime((time_t *)&end));
+
return;
} else if (ps->pss_state == DSS_CANCELED) {
if (ps->pss_func == POOL_SCAN_SCRUB) {
@@ -4310,19 +4319,15 @@
assert(ps->pss_state == DSS_SCANNING);
- /*
- * Scan is in progress.
- */
+ /* Scan is in progress. Resilvers can't be paused. */
if (ps->pss_func == POOL_SCAN_SCRUB) {
if (pause == 0) {
(void) printf(gettext("scrub in progress since %s"),
ctime(&start));
} else {
- char buf[32];
- struct tm *p = localtime(&pause);
- (void) strftime(buf, sizeof (buf), "%a %b %e %T %Y", p);
- (void) printf(gettext("scrub paused since %s\n"), buf);
- (void) printf(gettext("\tscrub started on %s"),
+ (void) printf(gettext("scrub paused since %s"),
+ ctime(&pause));
+ (void) printf(gettext("\tscrub started on %s"),
ctime(&start));
}
} else if (ps->pss_func == POOL_SCAN_RESILVER) {
@@ -4330,49 +4335,67 @@
ctime(&start));
}
- examined = ps->pss_examined ? ps->pss_examined : 1;
+ scanned = ps->pss_examined;
+ pass_scanned = ps->pss_pass_exam;
+ issued = ps->pss_issued;
+ pass_issued = ps->pss_pass_issued;
total = ps->pss_to_examine;
- fraction_done = (double)examined / total;
- /* elapsed time for this pass */
+ /* we are only done with a block once we have issued the IO for it */
+ fraction_done = (double)issued / total;
+
+ /* elapsed time for this pass, rounding up to 1 if it's 0 */
elapsed = time(NULL) - ps->pss_pass_start;
elapsed -= ps->pss_pass_scrub_spent_paused;
- elapsed = elapsed ? elapsed : 1;
- pass_exam = ps->pss_pass_exam ? ps->pss_pass_exam : 1;
- rate = pass_exam / elapsed;
- rate = rate ? rate : 1;
- mins_left = ((total - examined) / rate) / 60;
- hours_left = mins_left / 60;
+ elapsed = (elapsed != 0) ? elapsed : 1;
- zfs_nicenum(examined, examined_buf, sizeof (examined_buf));
+ scan_rate = pass_scanned / elapsed;
+ issue_rate = pass_issued / elapsed;
+ total_secs_left = (issue_rate != 0) ?
+ ((total - issued) / issue_rate) : UINT64_MAX;
+
+ days_left = total_secs_left / 60 / 60 / 24;
+ hours_left = (total_secs_left / 60 / 60) % 24;
+ mins_left = (total_secs_left / 60) % 60;
+ secs_left = (total_secs_left % 60);
+
+ /* format all of the numbers we will be reporting */
+ zfs_nicenum(scanned, scanned_buf, sizeof (scanned_buf));
+ zfs_nicenum(issued, issued_buf, sizeof (issued_buf));
zfs_nicenum(total, total_buf, sizeof (total_buf));
+ zfs_nicenum(scan_rate, srate_buf, sizeof (srate_buf));
+ zfs_nicenum(issue_rate, irate_buf, sizeof (irate_buf));
- /*
- * do not print estimated time if hours_left is more than 30 days
- * or we have a paused scrub
- */
+ /* do not print estimated time if we have a paused scrub */
if (pause == 0) {
- zfs_nicenum(rate, rate_buf, sizeof (rate_buf));
- (void) printf(gettext("\t%s scanned out of %s at %s/s"),
- examined_buf, total_buf, rate_buf);
- if (hours_left < (30 * 24)) {
- (void) printf(gettext(", %lluh%um to go\n"),
- (u_longlong_t)hours_left, (uint_t)(mins_left % 60));
- } else {
- (void) printf(gettext(
- ", (scan is slow, no estimated time)\n"));
- }
+ (void) printf(gettext("\t%s scanned at %s/s, "
+ "%s issued at %s/s, %s total\n"),
+ scanned_buf, srate_buf, issued_buf, irate_buf, total_buf);
} else {
- (void) printf(gettext("\t%s scanned out of %s\n"),
- examined_buf, total_buf);
+ (void) printf(gettext("\t%s scanned, %s issued, %s total\n"),
+ scanned_buf, issued_buf, total_buf);
}
if (ps->pss_func == POOL_SCAN_RESILVER) {
- (void) printf(gettext(" %s resilvered, %.2f%% done\n"),
+ (void) printf(gettext("\t%s resilvered, %.2f%% done"),
processed_buf, 100 * fraction_done);
} else if (ps->pss_func == POOL_SCAN_SCRUB) {
- (void) printf(gettext(" %s repaired, %.2f%% done\n"),
+ (void) printf(gettext("\t%s repaired, %.2f%% done"),
processed_buf, 100 * fraction_done);
+ }
+
+ if (pause == 0) {
+ if (issue_rate >= 10 * 1024 * 1024) {
+ (void) printf(gettext(", %llu days "
+ "%02llu:%02llu:%02llu to go\n"),
+ (u_longlong_t)days_left, (u_longlong_t)hours_left,
+ (u_longlong_t)mins_left, (u_longlong_t)secs_left);
+ } else {
+ (void) printf(gettext(", no estimated "
+ "completion time\n"));
+ }
+ } else {
+ (void) printf(gettext("\n"));
}
}
Index: head/cddl/contrib/opensolaris/cmd/ztest/ztest.c
===================================================================
--- head/cddl/contrib/opensolaris/cmd/ztest/ztest.c
+++ head/cddl/contrib/opensolaris/cmd/ztest/ztest.c
@@ -374,15 +374,15 @@
{ ztest_fzap, 1, &zopt_sometimes },
{ ztest_dmu_snapshot_create_destroy, 1, &zopt_sometimes },
{ ztest_spa_create_destroy, 1, &zopt_sometimes },
- { ztest_fault_inject, 1, &zopt_sometimes },
+ { ztest_fault_inject, 1, &zopt_incessant },
{ ztest_ddt_repair, 1, &zopt_sometimes },
{ ztest_dmu_snapshot_hold, 1, &zopt_sometimes },
{ ztest_reguid, 1, &zopt_rarely },
{ ztest_spa_rename, 1, &zopt_rarely },
- { ztest_scrub, 1, &zopt_rarely },
+ { ztest_scrub, 1, &zopt_often },
{ ztest_spa_upgrade, 1, &zopt_rarely },
{ ztest_dsl_dataset_promote_busy, 1, &zopt_rarely },
- { ztest_vdev_attach_detach, 1, &zopt_sometimes },
+ { ztest_vdev_attach_detach, 1, &zopt_incessant },
{ ztest_vdev_LUN_growth, 1, &zopt_rarely },
{ ztest_vdev_add_remove, 1,
&ztest_opts.zo_vdevtime },
Index: head/cddl/contrib/opensolaris/lib/libzfs/common/libzfs_status.c
===================================================================
--- head/cddl/contrib/opensolaris/lib/libzfs/common/libzfs_status.c
+++ head/cddl/contrib/opensolaris/lib/libzfs/common/libzfs_status.c
@@ -219,7 +219,7 @@
*/
(void) nvlist_lookup_uint64_array(nvroot, ZPOOL_CONFIG_SCAN_STATS,
(uint64_t **)&ps, &psc);
- if (ps && ps->pss_func == POOL_SCAN_RESILVER &&
+ if (ps != NULL && ps->pss_func == POOL_SCAN_RESILVER &&
ps->pss_state == DSS_SCANNING)
return (ZPOOL_STATUS_RESILVERING);
Index: head/cddl/contrib/opensolaris/lib/libzpool/common/sys/zfs_context.h
===================================================================
--- head/cddl/contrib/opensolaris/lib/libzpool/common/sys/zfs_context.h
+++ head/cddl/contrib/opensolaris/lib/libzpool/common/sys/zfs_context.h
@@ -408,6 +408,7 @@
#define TQ_NOQUEUE 0x02 /* Do not enqueue if can't dispatch */
#define TQ_FRONT 0x08 /* Queue in front */
+#define TASKQID_INVALID ((taskqid_t)0)
extern taskq_t *system_taskq;
@@ -421,6 +422,7 @@
taskq_ent_t *);
extern void taskq_destroy(taskq_t *);
extern void taskq_wait(taskq_t *);
+extern void taskq_wait_id(taskq_t *, taskqid_t);
extern int taskq_member(taskq_t *, void *);
extern void system_taskq_init(void);
extern void system_taskq_fini(void);
Index: head/cddl/contrib/opensolaris/lib/libzpool/common/taskq.c
===================================================================
--- head/cddl/contrib/opensolaris/lib/libzpool/common/taskq.c
+++ head/cddl/contrib/opensolaris/lib/libzpool/common/taskq.c
@@ -187,6 +187,12 @@
mutex_exit(&tq->tq_lock);
}
+void
+taskq_wait_id(taskq_t *tq, taskqid_t id)
+{
+ taskq_wait(tq);
+}
+
static void *
taskq_thread(void *arg)
{
Index: head/sys/cddl/compat/opensolaris/kern/opensolaris_taskq.c
===================================================================
--- head/sys/cddl/compat/opensolaris/kern/opensolaris_taskq.c
+++ head/sys/cddl/compat/opensolaris/kern/opensolaris_taskq.c
@@ -173,3 +173,9 @@
{
taskqueue_drain_all(tq->tq_queue);
}
+
+void
+taskq_wait_id(taskq_t *tq, taskqid_t id)
+{
+ taskq_wait(tq);
+}
Index: head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/arc.c
===================================================================
--- head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/arc.c
+++ head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/arc.c
@@ -339,7 +339,8 @@
* minimum lifespan of a prefetch block in clock ticks
* (initialized in arc_init())
*/
-static int arc_min_prefetch_lifespan;
+static int zfs_arc_min_prefetch_ms = 1;
+static int zfs_arc_min_prescient_prefetch_ms = 6;
/*
* If this percent of memory is free, don't throttle.
@@ -779,8 +780,9 @@
kstat_named_t arcstat_meta_limit;
kstat_named_t arcstat_meta_max;
kstat_named_t arcstat_meta_min;
- kstat_named_t arcstat_sync_wait_for_async;
+ kstat_named_t arcstat_async_upgrade_sync;
kstat_named_t arcstat_demand_hit_predictive_prefetch;
+ kstat_named_t arcstat_demand_hit_prescient_prefetch;
} arc_stats_t;
static arc_stats_t arc_stats = {
@@ -877,8 +879,9 @@
{ "arc_meta_limit", KSTAT_DATA_UINT64 },
{ "arc_meta_max", KSTAT_DATA_UINT64 },
{ "arc_meta_min", KSTAT_DATA_UINT64 },
- { "sync_wait_for_async", KSTAT_DATA_UINT64 },
+ { "async_upgrade_sync", KSTAT_DATA_UINT64 },
{ "demand_hit_predictive_prefetch", KSTAT_DATA_UINT64 },
+ { "demand_hit_prescient_prefetch", KSTAT_DATA_UINT64 },
};
#define ARCSTAT(stat) (arc_stats.stat.value.ui64)
@@ -974,22 +977,23 @@
struct arc_callback {
void *acb_private;
- arc_done_func_t *acb_done;
+ arc_read_done_func_t *acb_done;
arc_buf_t *acb_buf;
boolean_t acb_compressed;
zio_t *acb_zio_dummy;
+ zio_t *acb_zio_head;
arc_callback_t *acb_next;
};
typedef struct arc_write_callback arc_write_callback_t;
struct arc_write_callback {
- void *awcb_private;
- arc_done_func_t *awcb_ready;
- arc_done_func_t *awcb_children_ready;
- arc_done_func_t *awcb_physdone;
- arc_done_func_t *awcb_done;
- arc_buf_t *awcb_buf;
+ void *awcb_private;
+ arc_write_done_func_t *awcb_ready;
+ arc_write_done_func_t *awcb_children_ready;
+ arc_write_done_func_t *awcb_physdone;
+ arc_write_done_func_t *awcb_done;
+ arc_buf_t *awcb_buf;
};
/*
@@ -1229,6 +1233,8 @@
#define HDR_IO_IN_PROGRESS(hdr) ((hdr)->b_flags & ARC_FLAG_IO_IN_PROGRESS)
#define HDR_IO_ERROR(hdr) ((hdr)->b_flags & ARC_FLAG_IO_ERROR)
#define HDR_PREFETCH(hdr) ((hdr)->b_flags & ARC_FLAG_PREFETCH)
+#define HDR_PRESCIENT_PREFETCH(hdr) \
+ ((hdr)->b_flags & ARC_FLAG_PRESCIENT_PREFETCH)
#define HDR_COMPRESSION_ENABLED(hdr) \
((hdr)->b_flags & ARC_FLAG_COMPRESSED_ARC)
@@ -1392,6 +1398,11 @@
SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, l2c_only_size, CTLFLAG_RD,
&ARC_l2c_only.arcs_size.rc_count, 0, "size of mru state");
+SYSCTL_UINT(_vfs_zfs, OID_AUTO, arc_min_prefetch_ms, CTLFLAG_RW,
+ &zfs_arc_min_prefetch_ms, 0, "Min life of prefetch block in ms");
+SYSCTL_UINT(_vfs_zfs, OID_AUTO, arc_min_prescient_prefetch_ms, CTLFLAG_RW,
+ &zfs_arc_min_prescient_prefetch_ms, 0, "Min life of prescient prefetched block in ms");
+
/*
* L2ARC Internals
*/
@@ -3544,6 +3555,8 @@
{
arc_state_t *evicted_state, *state;
int64_t bytes_evicted = 0;
+ int min_lifetime = HDR_PRESCIENT_PREFETCH(hdr) ?
+ zfs_arc_min_prescient_prefetch_ms : zfs_arc_min_prefetch_ms;
ASSERT(MUTEX_HELD(hash_lock));
ASSERT(HDR_HAS_L1HDR(hdr));
@@ -3596,8 +3609,7 @@
/* prefetch buffers have a minimum lifespan */
if (HDR_IO_IN_PROGRESS(hdr) ||
((hdr->b_flags & (ARC_FLAG_PREFETCH | ARC_FLAG_INDIRECT)) &&
- ddi_get_lbolt() - hdr->b_l1hdr.b_arc_access <
- arc_min_prefetch_lifespan)) {
+ ddi_get_lbolt() - hdr->b_l1hdr.b_arc_access < min_lifetime * hz)) {
ARCSTAT_BUMP(arcstat_evict_skip);
return (bytes_evicted);
}
@@ -4968,13 +4980,15 @@
* - move the buffer to the head of the list if this is
* another prefetch (to make it less likely to be evicted).
*/
- if (HDR_PREFETCH(hdr)) {
+ if (HDR_PREFETCH(hdr) || HDR_PRESCIENT_PREFETCH(hdr)) {
if (refcount_count(&hdr->b_l1hdr.b_refcnt) == 0) {
/* link protected by hash lock */
ASSERT(multilist_link_active(
&hdr->b_l1hdr.b_arc_node));
} else {
- arc_hdr_clear_flags(hdr, ARC_FLAG_PREFETCH);
+ arc_hdr_clear_flags(hdr,
+ ARC_FLAG_PREFETCH |
+ ARC_FLAG_PRESCIENT_PREFETCH);
ARCSTAT_BUMP(arcstat_mru_hits);
}
hdr->b_l1hdr.b_arc_access = now;
@@ -5005,10 +5019,13 @@
* MFU state.
*/
- if (HDR_PREFETCH(hdr)) {
+ if (HDR_PREFETCH(hdr) || HDR_PRESCIENT_PREFETCH(hdr)) {
new_state = arc_mru;
- if (refcount_count(&hdr->b_l1hdr.b_refcnt) > 0)
- arc_hdr_clear_flags(hdr, ARC_FLAG_PREFETCH);
+ if (refcount_count(&hdr->b_l1hdr.b_refcnt) > 0) {
+ arc_hdr_clear_flags(hdr,
+ ARC_FLAG_PREFETCH |
+ ARC_FLAG_PRESCIENT_PREFETCH);
+ }
DTRACE_PROBE1(new_state__mru, arc_buf_hdr_t *, hdr);
} else {
new_state = arc_mfu;
@@ -5029,11 +5046,7 @@
* If it was a prefetch, we will explicitly move it to
* the head of the list now.
*/
- if ((HDR_PREFETCH(hdr)) != 0) {
- ASSERT(refcount_is_zero(&hdr->b_l1hdr.b_refcnt));
- /* link protected by hash_lock */
- ASSERT(multilist_link_active(&hdr->b_l1hdr.b_arc_node));
- }
+
ARCSTAT_BUMP(arcstat_mfu_hits);
hdr->b_l1hdr.b_arc_access = ddi_get_lbolt();
} else if (hdr->b_l1hdr.b_state == arc_mfu_ghost) {
@@ -5044,12 +5057,11 @@
* MFU state.
*/
- if (HDR_PREFETCH(hdr)) {
+ if (HDR_PREFETCH(hdr) || HDR_PRESCIENT_PREFETCH(hdr)) {
/*
* This is a prefetch access...
* move this block back to the MRU state.
*/
- ASSERT0(refcount_count(&hdr->b_l1hdr.b_refcnt));
new_state = arc_mru;
}
@@ -5116,23 +5128,28 @@
demand, prefetch, !HDR_ISTYPE_METADATA(hdr), data, metadata, hits);
}
-/* a generic arc_done_func_t which you can use */
+/* a generic arc_read_done_func_t which you can use */
/* ARGSUSED */
void
-arc_bcopy_func(zio_t *zio, arc_buf_t *buf, void *arg)
+arc_bcopy_func(zio_t *zio, const zbookmark_phys_t *zb, const blkptr_t *bp,
+ arc_buf_t *buf, void *arg)
{
- if (zio == NULL || zio->io_error == 0)
- bcopy(buf->b_data, arg, arc_buf_size(buf));
+ if (buf == NULL)
+ return;
+
+ bcopy(buf->b_data, arg, arc_buf_size(buf));
arc_buf_destroy(buf, arg);
}
-/* a generic arc_done_func_t */
+/* a generic arc_read_done_func_t */
+/* ARGSUSED */
void
-arc_getbuf_func(zio_t *zio, arc_buf_t *buf, void *arg)
+arc_getbuf_func(zio_t *zio, const zbookmark_phys_t *zb, const blkptr_t *bp,
+ arc_buf_t *buf, void *arg)
{
arc_buf_t **bufp = arg;
- if (zio && zio->io_error) {
- arc_buf_destroy(buf, arg);
+
+ if (buf == NULL) {
*bufp = NULL;
} else {
*bufp = buf;
@@ -5164,7 +5181,6 @@
arc_callback_t *callback_list;
arc_callback_t *acb;
boolean_t freeable = B_FALSE;
- boolean_t no_zio_error = (zio->io_error == 0);
/*
* The hdr was inserted into hash-table and removed from lists
@@ -5190,7 +5206,7 @@
ASSERT3P(hash_lock, !=, NULL);
}
- if (no_zio_error) {
+ if (zio->io_error == 0) {
/* byteswap if necessary */
if (BP_SHOULD_BYTESWAP(zio->io_bp)) {
if (BP_GET_LEVEL(zio->io_bp) > 0) {
@@ -5211,7 +5227,8 @@
callback_list = hdr->b_l1hdr.b_acb;
ASSERT3P(callback_list, !=, NULL);
- if (hash_lock && no_zio_error && hdr->b_l1hdr.b_state == arc_anon) {
+ if (hash_lock && zio->io_error == 0 &&
+ hdr->b_l1hdr.b_state == arc_anon) {
/*
* Only call arc_access on anonymous buffers. This is because
* if we've issued an I/O for an evicted buffer, we've already
@@ -5232,14 +5249,21 @@
if (!acb->acb_done)
continue;
- /* This is a demand read since prefetches don't use callbacks */
callback_cnt++;
+ if (zio->io_error != 0)
+ continue;
+
int error = arc_buf_alloc_impl(hdr, acb->acb_private,
- acb->acb_compressed, no_zio_error, &acb->acb_buf);
- if (no_zio_error) {
- zio->io_error = error;
+ acb->acb_compressed,
+ B_TRUE, &acb->acb_buf);
+ if (error != 0) {
+ arc_buf_destroy(acb->acb_buf, acb->acb_private);
+ acb->acb_buf = NULL;
}
+
+ if (zio->io_error == 0)
+ zio->io_error = error;
}
hdr->b_l1hdr.b_acb = NULL;
arc_hdr_clear_flags(hdr, ARC_FLAG_IO_IN_PROGRESS);
@@ -5252,7 +5276,7 @@
ASSERT(refcount_is_zero(&hdr->b_l1hdr.b_refcnt) ||
callback_list != NULL);
- if (no_zio_error) {
+ if (zio->io_error == 0) {
arc_hdr_verify(hdr, zio->io_bp);
} else {
arc_hdr_set_flags(hdr, ARC_FLAG_IO_ERROR);
@@ -5285,8 +5309,10 @@
/* execute each callback and free its structure */
while ((acb = callback_list) != NULL) {
- if (acb->acb_done)
- acb->acb_done(zio, acb->acb_buf, acb->acb_private);
+ if (acb->acb_done) {
+ acb->acb_done(zio, &zio->io_bookmark, zio->io_bp,
+ acb->acb_buf, acb->acb_private);
+ }
if (acb->acb_zio_dummy != NULL) {
acb->acb_zio_dummy->io_error = zio->io_error;
@@ -5320,7 +5346,7 @@
* for readers of this block.
*/
int
-arc_read(zio_t *pio, spa_t *spa, const blkptr_t *bp, arc_done_func_t *done,
+arc_read(zio_t *pio, spa_t *spa, const blkptr_t *bp, arc_read_done_func_t *done,
void *private, zio_priority_t priority, int zio_flags,
arc_flags_t *arc_flags, const zbookmark_phys_t *zb)
{
@@ -5329,7 +5355,8 @@
zio_t *rzio;
uint64_t guid = spa_load_guid(spa);
boolean_t compressed_read = (zio_flags & ZIO_FLAG_RAW) != 0;
-
+ int rc = 0;
+
ASSERT(!BP_IS_EMBEDDED(bp) ||
BPE_GET_ETYPE(bp) == BP_EMBEDDED_TYPE_DATA);
@@ -5347,32 +5374,20 @@
*arc_flags |= ARC_FLAG_CACHED;
if (HDR_IO_IN_PROGRESS(hdr)) {
+ zio_t *head_zio = hdr->b_l1hdr.b_acb->acb_zio_head;
+ ASSERT3P(head_zio, !=, NULL);
if ((hdr->b_flags & ARC_FLAG_PRIO_ASYNC_READ) &&
priority == ZIO_PRIORITY_SYNC_READ) {
/*
- * This sync read must wait for an
- * in-progress async read (e.g. a predictive
- * prefetch). Async reads are queued
- * separately at the vdev_queue layer, so
- * this is a form of priority inversion.
- * Ideally, we would "inherit" the demand
- * i/o's priority by moving the i/o from
- * the async queue to the synchronous queue,
- * but there is currently no mechanism to do
- * so. Track this so that we can evaluate
- * the magnitude of this potential performance
- * problem.
- *
- * Note that if the prefetch i/o is already
- * active (has been issued to the device),
- * the prefetch improved performance, because
- * we issued it sooner than we would have
- * without the prefetch.
+ * This is a sync read that needs to wait for
+ * an in-flight async read. Request that the
+ * zio have its priority upgraded.
*/
- DTRACE_PROBE1(arc__sync__wait__for__async,
+ zio_change_priority(head_zio, priority);
+ DTRACE_PROBE1(arc__async__upgrade__sync,
arc_buf_hdr_t *, hdr);
- ARCSTAT_BUMP(arcstat_sync_wait_for_async);
+ ARCSTAT_BUMP(arcstat_async_upgrade_sync);
}
if (hdr->b_flags & ARC_FLAG_PREDICTIVE_PREFETCH) {
arc_hdr_clear_flags(hdr,
@@ -5399,6 +5414,7 @@
spa, NULL, NULL, NULL, zio_flags);
ASSERT3P(acb->acb_done, !=, NULL);
+ acb->acb_zio_head = head_zio;
acb->acb_next = hdr->b_l1hdr.b_acb;
hdr->b_l1hdr.b_acb = acb;
mutex_exit(hash_lock);
@@ -5426,17 +5442,32 @@
arc_hdr_clear_flags(hdr,
ARC_FLAG_PREDICTIVE_PREFETCH);
}
- ASSERT(!BP_IS_EMBEDDED(bp) || !BP_IS_HOLE(bp));
+ if (hdr->b_flags & ARC_FLAG_PRESCIENT_PREFETCH) {
+ ARCSTAT_BUMP(
+ arcstat_demand_hit_prescient_prefetch);
+ arc_hdr_clear_flags(hdr,
+ ARC_FLAG_PRESCIENT_PREFETCH);
+ }
+
+ ASSERT(!BP_IS_EMBEDDED(bp) || !BP_IS_HOLE(bp));
/* Get a buf with the desired data in it. */
- VERIFY0(arc_buf_alloc_impl(hdr, private,
- compressed_read, B_TRUE, &buf));
+ rc = arc_buf_alloc_impl(hdr, private,
+ compressed_read, B_TRUE, &buf);
+ if (rc != 0) {
+ arc_buf_destroy(buf, private);
+ buf = NULL;
+ }
+ ASSERT((zio_flags & ZIO_FLAG_SPECULATIVE) ||
+ rc == 0 || rc != ENOENT);
} else if (*arc_flags & ARC_FLAG_PREFETCH &&
refcount_count(&hdr->b_l1hdr.b_refcnt) == 0) {
arc_hdr_set_flags(hdr, ARC_FLAG_PREFETCH);
}
DTRACE_PROBE1(arc__hit, arc_buf_hdr_t *, hdr);
arc_access(hdr, hash_lock);
+ if (*arc_flags & ARC_FLAG_PRESCIENT_PREFETCH)
+ arc_hdr_set_flags(hdr, ARC_FLAG_PRESCIENT_PREFETCH);
if (*arc_flags & ARC_FLAG_L2CACHE)
arc_hdr_set_flags(hdr, ARC_FLAG_L2CACHE);
mutex_exit(hash_lock);
@@ -5446,7 +5477,7 @@
data, metadata, hits);
if (done)
- done(NULL, buf, private);
+ done(NULL, zb, bp, buf, private);
} else {
uint64_t lsize = BP_GET_LSIZE(bp);
uint64_t psize = BP_GET_PSIZE(bp);
@@ -5520,6 +5551,9 @@
if (*arc_flags & ARC_FLAG_PREFETCH)
arc_hdr_set_flags(hdr, ARC_FLAG_PREFETCH);
+ if (*arc_flags & ARC_FLAG_PRESCIENT_PREFETCH)
+ arc_hdr_set_flags(hdr, ARC_FLAG_PRESCIENT_PREFETCH);
+
if (*arc_flags & ARC_FLAG_L2CACHE)
arc_hdr_set_flags(hdr, ARC_FLAG_L2CACHE);
if (BP_GET_LEVEL(bp) > 0)
@@ -5549,14 +5583,17 @@
vd = NULL;
}
- if (priority == ZIO_PRIORITY_ASYNC_READ)
+ /*
+ * We count both async reads and scrub IOs as asynchronous so
+ * that both can be upgraded in the event of a cache hit while
+ * the read IO is still in-flight.
+ */
+ if (priority == ZIO_PRIORITY_ASYNC_READ ||
+ priority == ZIO_PRIORITY_SCRUB)
arc_hdr_set_flags(hdr, ARC_FLAG_PRIO_ASYNC_READ);
else
arc_hdr_clear_flags(hdr, ARC_FLAG_PRIO_ASYNC_READ);
- if (hash_lock != NULL)
- mutex_exit(hash_lock);
-
/*
* At this point, we have a level 1 cache miss. Try again in
* L2ARC if possible.
@@ -5637,6 +5674,11 @@
ZIO_FLAG_CANFAIL |
ZIO_FLAG_DONT_PROPAGATE |
ZIO_FLAG_DONT_RETRY, B_FALSE);
+ acb->acb_zio_head = rzio;
+
+ if (hash_lock != NULL)
+ mutex_exit(hash_lock);
+
DTRACE_PROBE2(l2arc__read, vdev_t *, vd,
zio_t *, rzio);
ARCSTAT_INCR(arcstat_l2_read_bytes, size);
@@ -5651,6 +5693,8 @@
return (0);
/* l2arc read error; goto zio_read() */
+ if (hash_lock != NULL)
+ mutex_enter(hash_lock);
} else {
DTRACE_PROBE1(l2arc__miss,
arc_buf_hdr_t *, hdr);
@@ -5671,7 +5715,11 @@
rzio = zio_read(pio, spa, bp, hdr->b_l1hdr.b_pabd, size,
arc_read_done, hdr, priority, zio_flags, zb);
+ acb->acb_zio_head = rzio;
+ if (hash_lock != NULL)
+ mutex_exit(hash_lock);
+
if (*arc_flags & ARC_FLAG_WAIT)
return (zio_wait(rzio));
@@ -6162,9 +6210,9 @@
zio_t *
arc_write(zio_t *pio, spa_t *spa, uint64_t txg, blkptr_t *bp, arc_buf_t *buf,
- boolean_t l2arc, const zio_prop_t *zp, arc_done_func_t *ready,
- arc_done_func_t *children_ready, arc_done_func_t *physdone,
- arc_done_func_t *done, void *private, zio_priority_t priority,
+ boolean_t l2arc, const zio_prop_t *zp, arc_write_done_func_t *ready,
+ arc_write_done_func_t *children_ready, arc_write_done_func_t *physdone,
+ arc_write_done_func_t *done, void *private, zio_priority_t priority,
int zio_flags, const zbookmark_phys_t *zb)
{
arc_buf_hdr_t *hdr = buf->b_hdr;
@@ -6590,9 +6638,6 @@
mutex_init(&arc_dnlc_evicts_lock, NULL, MUTEX_DEFAULT, NULL);
cv_init(&arc_dnlc_evicts_cv, NULL, CV_DEFAULT, NULL);
-
- /* Convert seconds to clock ticks */
- arc_min_prefetch_lifespan = 1 * hz;
/* set min cache to 1/32 of all memory, or arc_abs_min, whichever is more */
arc_c_min = MAX(allmem / 32, arc_abs_min);
Index: head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dbuf.c
===================================================================
--- head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dbuf.c
+++ head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dbuf.c
@@ -902,7 +902,8 @@
}
static void
-dbuf_read_done(zio_t *zio, arc_buf_t *buf, void *vdb)
+dbuf_read_done(zio_t *zio, const zbookmark_phys_t *zb, const blkptr_t *bp,
+ arc_buf_t *buf, void *vdb)
{
dmu_buf_impl_t *db = vdb;
@@ -916,19 +917,22 @@
ASSERT(db->db.db_data == NULL);
if (db->db_level == 0 && db->db_freed_in_flight) {
/* we were freed in flight; disregard any error */
+ if (buf == NULL) {
+ buf = arc_alloc_buf(db->db_objset->os_spa,
+ db, DBUF_GET_BUFC_TYPE(db), db->db.db_size);
+ }
arc_release(buf, db);
bzero(buf->b_data, db->db.db_size);
arc_buf_freeze(buf);
db->db_freed_in_flight = FALSE;
dbuf_set_data(db, buf);
db->db_state = DB_CACHED;
- } else if (zio == NULL || zio->io_error == 0) {
+ } else if (buf != NULL) {
dbuf_set_data(db, buf);
db->db_state = DB_CACHED;
} else {
ASSERT(db->db_blkid != DMU_BONUS_BLKID);
ASSERT3P(db->db_buf, ==, NULL);
- arc_buf_destroy(buf, db);
db->db_state = DB_UNCACHED;
}
cv_broadcast(&db->db_changed);
@@ -2326,7 +2330,8 @@
* prefetch if the next block down is our target.
*/
static void
-dbuf_prefetch_indirect_done(zio_t *zio, arc_buf_t *abuf, void *private)
+dbuf_prefetch_indirect_done(zio_t *zio, const zbookmark_phys_t *zb,
+ const blkptr_t *iobp, arc_buf_t *abuf, void *private)
{
dbuf_prefetch_arg_t *dpa = private;
@@ -2365,13 +2370,18 @@
dbuf_rele(db, FTAG);
}
+ if (abuf == NULL) {
+ kmem_free(dpa, sizeof(*dpa));
+ return;
+ }
+
dpa->dpa_curlevel--;
uint64_t nextblkid = dpa->dpa_zb.zb_blkid >>
(dpa->dpa_epbs * (dpa->dpa_curlevel - dpa->dpa_zb.zb_level));
blkptr_t *bp = ((blkptr_t *)abuf->b_data) +
P2PHASE(nextblkid, 1ULL << dpa->dpa_epbs);
- if (BP_IS_HOLE(bp) || (zio != NULL && zio->io_error != 0)) {
+ if (BP_IS_HOLE(bp)) {
kmem_free(dpa, sizeof (*dpa));
} else if (dpa->dpa_curlevel == dpa->dpa_zb.zb_level) {
ASSERT3U(nextblkid, ==, dpa->dpa_zb.zb_blkid);
@@ -3746,7 +3756,7 @@
* ready callback so that we can properly handle an indirect
* block that only contains holes.
*/
- arc_done_func_t *children_ready_cb = NULL;
+ arc_write_done_func_t *children_ready_cb = NULL;
if (db->db_level != 0)
children_ready_cb = dbuf_write_children_ready;
Index: head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/ddt.c
===================================================================
--- head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/ddt.c
+++ head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/ddt.c
@@ -1112,14 +1112,26 @@
void
ddt_sync(spa_t *spa, uint64_t txg)
{
+ dsl_scan_t *scn = spa->spa_dsl_pool->dp_scan;
dmu_tx_t *tx;
- zio_t *rio = zio_root(spa, NULL, NULL,
- ZIO_FLAG_CANFAIL | ZIO_FLAG_SPECULATIVE | ZIO_FLAG_SELF_HEAL);
+ zio_t *rio;
ASSERT(spa_syncing_txg(spa) == txg);
tx = dmu_tx_create_assigned(spa->spa_dsl_pool, txg);
+ rio = zio_root(spa, NULL, NULL,
+ ZIO_FLAG_CANFAIL | ZIO_FLAG_SPECULATIVE | ZIO_FLAG_SELF_HEAL);
+
+ /*
+ * This function may cause an immediate scan of ddt blocks (see
+ * the comment above dsl_scan_ddt() for details). We set the
+ * scan's root zio here so that we can wait for any scan IOs in
+ * addition to the regular ddt IOs.
+ */
+ ASSERT3P(scn->scn_zio_root, ==, NULL);
+ scn->scn_zio_root = rio;
+
for (enum zio_checksum c = 0; c < ZIO_CHECKSUM_FUNCTIONS; c++) {
ddt_t *ddt = spa->spa_ddt[c];
if (ddt == NULL)
@@ -1129,6 +1141,7 @@
}
(void) zio_wait(rio);
+ scn->scn_zio_root = NULL;
dmu_tx_commit(tx);
}
Index: head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dmu_objset.c
===================================================================
--- head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dmu_objset.c
+++ head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dmu_objset.c
@@ -349,6 +349,7 @@
ASSERT(ds == NULL || MUTEX_HELD(&ds->ds_opening_lock));
+#if 0
/*
* The $ORIGIN dataset (if it exists) doesn't have an associated
* objset, so there's no reason to open it. The $ORIGIN dataset
@@ -359,6 +360,7 @@
ASSERT3P(ds->ds_dir, !=,
spa_get_dsl(spa)->dp_origin_snap->ds_dir);
}
+#endif
os = kmem_zalloc(sizeof (objset_t), KM_SLEEP);
os->os_dsl_dataset = ds;
Index: head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dmu_traverse.c
===================================================================
--- head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dmu_traverse.c
+++ head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dmu_traverse.c
@@ -499,8 +499,9 @@
const zbookmark_phys_t *zb, const dnode_phys_t *dnp, void *arg)
{
prefetch_data_t *pfd = arg;
- arc_flags_t aflags = ARC_FLAG_NOWAIT | ARC_FLAG_PREFETCH;
-
+ arc_flags_t aflags = ARC_FLAG_NOWAIT | ARC_FLAG_PREFETCH |
+ ARC_FLAG_PRESCIENT_PREFETCH;
+
ASSERT(pfd->pd_bytes_fetched >= 0);
if (bp == NULL)
return (0);
Index: head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dsl_scan.c
===================================================================
--- head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dsl_scan.c
+++ head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dsl_scan.c
@@ -51,28 +51,136 @@
#include <sys/sa_impl.h>
#include <sys/zfeature.h>
#include <sys/abd.h>
+#include <sys/range_tree.h>
#ifdef _KERNEL
#include <sys/zfs_vfsops.h>
#endif
+/*
+ * Grand theory statement on scan queue sorting
+ *
+ * Scanning is implemented by recursively traversing all indirection levels
+ * in an object and reading all blocks referenced from said objects. This
+ * results in us approximately traversing the object from lowest logical
+ * offset to the highest. For best performance, we would want the logical
+ * blocks to be physically contiguous. However, this is frequently not the
+ * case with pools given the allocation patterns of copy-on-write filesystems.
+ * So instead, we put the I/Os into a reordering queue and issue them in a
+ * way that will most benefit physical disks (LBA-order).
+ *
+ * Queue management:
+ *
+ * Ideally, we would want to scan all metadata and queue up all block I/O
+ * prior to starting to issue it, because that allows us to do an optimal
+ * sorting job. This can however consume large amounts of memory. Therefore
+ * we continuously monitor the size of the queues and constrain them to 5%
+ * (zfs_scan_mem_lim_fact) of physmem. If the queues grow larger than this
+ * limit, we clear out a few of the largest extents at the head of the queues
+ * to make room for more scanning. Hopefully, these extents will be fairly
+ * large and contiguous, allowing us to approach sequential I/O throughput
+ * even without a fully sorted tree.
+ *
+ * Metadata scanning takes place in dsl_scan_visit(), which is called from
+ * dsl_scan_sync() every spa_sync(). If we have either fully scanned all
+ * metadata on the pool, or we need to make room in memory because our
+ * queues are too large, dsl_scan_visit() is postponed and
+ * scan_io_queues_run() is called from dsl_scan_sync() instead. This implies
+ * that metadata scanning and queued I/O issuing are mutually exclusive. This
+ * allows us to provide maximum sequential I/O throughput for the majority of
+ * I/O's issued since sequential I/O performance is significantly negatively
+ * impacted if it is interleaved with random I/O.
+ *
+ * Implementation Notes
+ *
+ * One side effect of the queued scanning algorithm is that the scanning code
+ * needs to be notified whenever a block is freed. This is needed to allow
+ * the scanning code to remove these I/Os from the issuing queue. Additionally,
+ * we do not attempt to queue gang blocks to be issued sequentially since this
+ * is very hard to do and would have an extremely limited performance benefit.
+ * Instead, we simply issue gang I/Os as soon as we find them using the legacy
+ * algorithm.
+ *
+ * Backwards compatibility
+ *
+ * This new algorithm is backwards compatible with the legacy on-disk data
+ * structures (and therefore does not require a new feature flag).
+ * Periodically during scanning (see zfs_scan_checkpoint_intval), the scan
+ * will stop scanning metadata (in logical order) and wait for all outstanding
+ * sorted I/O to complete. Once this is done, we write out a checkpoint
+ * bookmark, indicating that we have scanned everything logically before it.
+ * If the pool is imported on a machine without the new sorting algorithm,
+ * the scan simply resumes from the last checkpoint using the legacy algorithm.
+ */
+
typedef int (scan_cb_t)(dsl_pool_t *, const blkptr_t *,
const zbookmark_phys_t *);
static scan_cb_t dsl_scan_scrub_cb;
-static void dsl_scan_cancel_sync(void *, dmu_tx_t *);
-static void dsl_scan_sync_state(dsl_scan_t *, dmu_tx_t *);
-static boolean_t dsl_scan_restarting(dsl_scan_t *, dmu_tx_t *);
-unsigned int zfs_top_maxinflight = 32; /* maximum I/Os per top-level */
-unsigned int zfs_resilver_delay = 2; /* number of ticks to delay resilver */
-unsigned int zfs_scrub_delay = 4; /* number of ticks to delay scrub */
+static int scan_ds_queue_compare(const void *a, const void *b);
+static int scan_prefetch_queue_compare(const void *a, const void *b);
+static void scan_ds_queue_clear(dsl_scan_t *scn);
+static boolean_t scan_ds_queue_contains(dsl_scan_t *scn, uint64_t dsobj,
+ uint64_t *txg);
+static void scan_ds_queue_insert(dsl_scan_t *scn, uint64_t dsobj, uint64_t txg);
+static void scan_ds_queue_remove(dsl_scan_t *scn, uint64_t dsobj);
+static void scan_ds_queue_sync(dsl_scan_t *scn, dmu_tx_t *tx);
+
+extern int zfs_vdev_async_write_active_min_dirty_percent;
+
+/*
+ * By default zfs will check to ensure it is not over the hard memory
+ * limit before each txg. If finer-grained control of this is needed
+ * this value can be set to 1 to enable checking before scanning each
+ * block.
+ */
+int zfs_scan_strict_mem_lim = B_FALSE;
+
+/*
+ * Maximum number of I/Os executing in parallel per top-level vdev.
+ * Tune with care. Very high settings (hundreds) are known to trigger
+ * some firmware bugs and resets on certain SSDs.
+ */
+int zfs_top_maxinflight = 32; /* maximum I/Os per top-level */
+unsigned int zfs_resilver_delay = 2; /* number of ticks to delay resilver -- 2 is a good number */
+unsigned int zfs_scrub_delay = 4; /* number of ticks to delay scrub -- 4 is a good number */
unsigned int zfs_scan_idle = 50; /* idle window in clock ticks */
-unsigned int zfs_scan_min_time_ms = 1000; /* min millisecs to scrub per txg */
+/*
+ * Maximum number of bytes executing in parallel per leaf vdev. We attempt
+ * to strike a balance here between keeping the vdev queues full of I/Os
+ * at all times and not overflowing the queues to cause long latency,
+ * which would cause long txg sync times. No matter what, we will not
+ * overload the drives with I/O, since that is protected by
+ * zfs_vdev_scrub_max_active.
+ */
+unsigned long zfs_scan_vdev_limit = 4 << 20;
+
+int zfs_scan_issue_strategy = 0;
+int zfs_scan_legacy = B_FALSE; /* don't queue & sort zios, go direct */
+uint64_t zfs_scan_max_ext_gap = 2 << 20; /* in bytes */
+
+unsigned int zfs_scan_checkpoint_intval = 7200; /* seconds */
+#define ZFS_SCAN_CHECKPOINT_INTVAL SEC_TO_TICK(zfs_scan_checkpoint_intval)
+
+/*
+ * fill_weight is non-tunable at runtime, so we copy it at module init from
+ * zfs_scan_fill_weight. Runtime adjustments to zfs_scan_fill_weight would
+ * break queue sorting.
+ */
+uint64_t zfs_scan_fill_weight = 3;
+static uint64_t fill_weight;
+
+/* See dsl_scan_should_clear() for details on the memory limit tunables */
+uint64_t zfs_scan_mem_lim_min = 16 << 20; /* bytes */
+uint64_t zfs_scan_mem_lim_soft_max = 128 << 20; /* bytes */
+int zfs_scan_mem_lim_fact = 20; /* fraction of physmem */
+int zfs_scan_mem_lim_soft_fact = 20; /* fraction of mem lim above */
+
+unsigned int zfs_scrub_min_time_ms = 1000; /* min millisecs to scrub per txg */
unsigned int zfs_free_min_time_ms = 1000; /* min millisecs to free per txg */
unsigned int zfs_obsolete_min_time_ms = 500; /* min millisecs to obsolete per txg */
-unsigned int zfs_resilver_min_time_ms = 3000; /* min millisecs to resilver
- per txg */
+unsigned int zfs_resilver_min_time_ms = 3000; /* min millisecs to resilver per txg */
boolean_t zfs_no_scrub_io = B_FALSE; /* set to disable scrub i/o */
boolean_t zfs_no_scrub_prefetch = B_FALSE; /* set to disable scrub prefetch */
@@ -86,7 +194,7 @@
SYSCTL_UINT(_vfs_zfs, OID_AUTO, scan_idle, CTLFLAG_RWTUN,
&zfs_scan_idle, 0, "Idle scan window in clock ticks");
SYSCTL_UINT(_vfs_zfs, OID_AUTO, scan_min_time_ms, CTLFLAG_RWTUN,
- &zfs_scan_min_time_ms, 0, "Min millisecs to scrub per txg");
+ &zfs_scrub_min_time_ms, 0, "Min millisecs to scrub per txg");
SYSCTL_UINT(_vfs_zfs, OID_AUTO, free_min_time_ms, CTLFLAG_RWTUN,
&zfs_free_min_time_ms, 0, "Min millisecs to free per txg");
SYSCTL_UINT(_vfs_zfs, OID_AUTO, resilver_min_time_ms, CTLFLAG_RWTUN,
@@ -95,6 +203,10 @@
&zfs_no_scrub_io, 0, "Disable scrub I/O");
SYSCTL_INT(_vfs_zfs, OID_AUTO, no_scrub_prefetch, CTLFLAG_RWTUN,
&zfs_no_scrub_prefetch, 0, "Disable scrub prefetching");
+SYSCTL_INT(_vfs_zfs, OID_AUTO, zfs_scan_legacy, CTLFLAG_RWTUN,
+ &zfs_scan_legacy, 0, "Scrub using legacy non-sequential method");
+SYSCTL_UINT(_vfs_zfs, OID_AUTO, zfs_scan_checkpoint_interval, CTLFLAG_RWTUN,
+ &zfs_scan_checkpoint_intval, 0, "Scan progress on-disk checkpointing interval");
enum ddt_class zfs_scrub_ddt_class_max = DDT_CLASS_DUPLICATE;
/* max number of blocks to free in a single TXG */
@@ -102,7 +214,19 @@
SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, free_max_blocks, CTLFLAG_RWTUN,
&zfs_async_block_max_blocks, 0, "Maximum number of blocks to free in one TXG");
+/*
+ * We wait a few txgs after importing a pool to begin scanning so that
+ * the import / mounting code isn't held up by scrub / resilver IO.
+ * Unfortunately, it is a bit difficult to determine exactly how long
+ * this will take since userspace will trigger fs mounts asynchronously
+ * and the kernel will create zvol minors asynchronously. As a result,
+ * the value provided here is a bit arbitrary, but represents a
+ * reasonable estimate of how many txgs it will take to finish fully
+ * importing a pool.
+ */
+#define SCAN_IMPORT_WAIT_TXGS 5
+
#define DSL_SCAN_IS_SCRUB_RESILVER(scn) \
((scn)->scn_phys.scn_func == POOL_SCAN_SCRUB || \
(scn)->scn_phys.scn_func == POOL_SCAN_RESILVER)
@@ -124,6 +248,177 @@
dsl_scan_scrub_cb, /* POOL_SCAN_RESILVER */
};
+/* In core node for the scn->scn_queue. Represents a dataset to be scanned */
+typedef struct {
+ uint64_t sds_dsobj;
+ uint64_t sds_txg;
+ avl_node_t sds_node;
+} scan_ds_t;
+
+/*
+ * This controls what conditions are placed on dsl_scan_sync_state():
+ * SYNC_OPTIONAL) write out scn_phys iff scn_bytes_pending == 0
+ * SYNC_MANDATORY) write out scn_phys always. scn_bytes_pending must be 0.
+ * SYNC_CACHED) if scn_bytes_pending == 0, write out scn_phys. Otherwise
+ * write out the scn_phys_cached version.
+ * See dsl_scan_sync_state for details.
+ */
+typedef enum {
+ SYNC_OPTIONAL,
+ SYNC_MANDATORY,
+ SYNC_CACHED
+} state_sync_type_t;
+
+/*
+ * This struct represents the minimum information needed to reconstruct a
+ * zio for sequential scanning. This is useful because many of these will
+ * accumulate in the sequential IO queues before being issued, so saving
+ * memory matters here.
+ */
+typedef struct scan_io {
+ /* fields from blkptr_t */
+ uint64_t sio_offset; /* DVA offset; the vdev id is not stored */
+ uint64_t sio_blk_prop;
+ uint64_t sio_phys_birth;
+ uint64_t sio_birth;
+ zio_cksum_t sio_cksum;
+ uint32_t sio_asize; /* DVA asize */
+
+ /* fields from zio_t */
+ int sio_flags;
+ zbookmark_phys_t sio_zb;
+
+ /* members for queue sorting */
+ union {
+ avl_node_t sio_addr_node; /* link into issuing queue */
+ list_node_t sio_list_node; /* link for issuing to disk */
+ } sio_nodes;
+} scan_io_t;
+
+struct dsl_scan_io_queue {
+ dsl_scan_t *q_scn; /* associated dsl_scan_t */
+ vdev_t *q_vd; /* top-level vdev that this queue represents */
+
+ /* trees used for sorting I/Os and extents of I/Os */
+ range_tree_t *q_exts_by_addr;
+ avl_tree_t q_exts_by_size;
+ avl_tree_t q_sios_by_addr;
+
+ /* members for zio rate limiting */
+ uint64_t q_maxinflight_bytes;
+ uint64_t q_inflight_bytes;
+ kcondvar_t q_zio_cv; /* used under vd->vdev_scan_io_queue_lock */
+
+ /* per txg statistics */
+ uint64_t q_total_seg_size_this_txg;
+ uint64_t q_segs_this_txg;
+ uint64_t q_total_zio_size_this_txg;
+ uint64_t q_zios_this_txg;
+};
+
+/* private data for dsl_scan_prefetch_cb() */
+typedef struct scan_prefetch_ctx {
+ refcount_t spc_refcnt; /* refcount for memory management */
+ dsl_scan_t *spc_scn; /* dsl_scan_t for the pool */
+ boolean_t spc_root; /* is this prefetch for an objset? */
+ uint8_t spc_indblkshift; /* dn_indblkshift of current dnode */
+ uint16_t spc_datablkszsec; /* dn_datablkszsec of current dnode */
+} scan_prefetch_ctx_t;
+
+/* private data for dsl_scan_prefetch() */
+typedef struct scan_prefetch_issue_ctx {
+ avl_node_t spic_avl_node; /* link into scn->scn_prefetch_queue */
+ scan_prefetch_ctx_t *spic_spc; /* spc for the callback */
+ blkptr_t spic_bp; /* bp to prefetch */
+ zbookmark_phys_t spic_zb; /* bookmark to prefetch */
+} scan_prefetch_issue_ctx_t;
+
+static void scan_exec_io(dsl_pool_t *dp, const blkptr_t *bp, int zio_flags,
+ const zbookmark_phys_t *zb, dsl_scan_io_queue_t *queue);
+static void scan_io_queue_insert_impl(dsl_scan_io_queue_t *queue,
+ scan_io_t *sio);
+
+static dsl_scan_io_queue_t *scan_io_queue_create(vdev_t *vd);
+static void scan_io_queues_destroy(dsl_scan_t *scn);
+
+static kmem_cache_t *sio_cache;
+
+void
+scan_init(void)
+{
+ /*
+ * This is used in ext_size_compare() to weight segments
+ * based on how sparse they are. This cannot be changed
+ * mid-scan and the tree comparison functions don't currently
+ * have a mechanism for passing additional context to the
+ * compare functions. Thus we store this value globally and
+ * we only allow it to be set at module initialization time.
+ */
+ fill_weight = zfs_scan_fill_weight;
+
+ sio_cache = kmem_cache_create("sio_cache",
+ sizeof (scan_io_t), 0, NULL, NULL, NULL, NULL, NULL, 0);
+}
+
+void
+scan_fini(void)
+{
+ kmem_cache_destroy(sio_cache);
+}
+
+static inline boolean_t
+dsl_scan_is_running(const dsl_scan_t *scn)
+{
+ return (scn->scn_phys.scn_state == DSS_SCANNING);
+}
+
+boolean_t
+dsl_scan_resilvering(dsl_pool_t *dp)
+{
+ return (dsl_scan_is_running(dp->dp_scan) &&
+ dp->dp_scan->scn_phys.scn_func == POOL_SCAN_RESILVER);
+}
+
+static inline void
+sio2bp(const scan_io_t *sio, blkptr_t *bp, uint64_t vdev_id)
+{
+ bzero(bp, sizeof (*bp));
+ DVA_SET_ASIZE(&bp->blk_dva[0], sio->sio_asize);
+ DVA_SET_VDEV(&bp->blk_dva[0], vdev_id);
+ DVA_SET_OFFSET(&bp->blk_dva[0], sio->sio_offset);
+ bp->blk_prop = sio->sio_blk_prop;
+ bp->blk_phys_birth = sio->sio_phys_birth;
+ bp->blk_birth = sio->sio_birth;
+ bp->blk_fill = 1; /* we always only work with data pointers */
+ bp->blk_cksum = sio->sio_cksum;
+}
+
+static inline void
+bp2sio(const blkptr_t *bp, scan_io_t *sio, int dva_i)
+{
+ /* we discard the vdev id, since we can deduce it from the queue */
+ sio->sio_offset = DVA_GET_OFFSET(&bp->blk_dva[dva_i]);
+ sio->sio_asize = DVA_GET_ASIZE(&bp->blk_dva[dva_i]);
+ sio->sio_blk_prop = bp->blk_prop;
+ sio->sio_phys_birth = bp->blk_phys_birth;
+ sio->sio_birth = bp->blk_birth;
+ sio->sio_cksum = bp->blk_cksum;
+}
+
+void
+dsl_scan_global_init(void)
+{
+ /*
+ * This is used in ext_size_compare() to weight segments
+ * based on how sparse they are. This cannot be changed
+ * mid-scan and the tree comparison functions don't currently
+ * have a mechanism for passing additional context to the
+ * compare functions. Thus we store this value globally and
+ * we only allow it to be set at module initialization time.
+ */
+ fill_weight = zfs_scan_fill_weight;
+}
+
int
dsl_scan_init(dsl_pool_t *dp, uint64_t txg)
{
@@ -144,6 +439,13 @@
scn->scn_async_destroying = spa_feature_is_active(dp->dp_spa,
SPA_FEATURE_ASYNC_DESTROY);
+ bcopy(&scn->scn_phys, &scn->scn_phys_cached, sizeof (scn->scn_phys));
+ avl_create(&scn->scn_queue, scan_ds_queue_compare, sizeof (scan_ds_t),
+ offsetof(scan_ds_t, sds_node));
+ avl_create(&scn->scn_prefetch_queue, scan_prefetch_queue_compare,
+ sizeof (scan_prefetch_issue_ctx_t),
+ offsetof(scan_prefetch_issue_ctx_t, spic_avl_node));
+
err = zap_lookup(dp->dp_meta_objset, DMU_POOL_DIRECTORY_OBJECT,
"scrub_func", sizeof (uint64_t), 1, &f);
if (err == 0) {
@@ -154,7 +456,7 @@
scn->scn_restart_txg = txg;
zfs_dbgmsg("old-style scrub was in progress; "
"restarting new-style scrub in txg %llu",
- scn->scn_restart_txg);
+ (longlong_t)scn->scn_restart_txg);
/*
* Load the queue obj from the old location so that it
@@ -172,7 +474,14 @@
else if (err)
return (err);
- if (scn->scn_phys.scn_state == DSS_SCANNING &&
+ /*
+ * We might be restarting after a reboot, so jump the issued
+ * counter to how far we've scanned. We know we're consistent
+ * up to here.
+ */
+ scn->scn_issued_before_pass = scn->scn_phys.scn_examined;
+
+ if (dsl_scan_is_running(scn) &&
spa_prev_software_version(dp->dp_spa) < SPA_VERSION_SCAN) {
/*
* A new-type scrub was in progress on an old
@@ -184,10 +493,26 @@
scn->scn_restart_txg = txg;
zfs_dbgmsg("new-style scrub was modified "
"by old software; restarting in txg %llu",
- scn->scn_restart_txg);
+ (longlong_t)scn->scn_restart_txg);
}
}
+ /* reload the queue into the in-core state */
+ if (scn->scn_phys.scn_queue_obj != 0) {
+ zap_cursor_t zc;
+ zap_attribute_t za;
+
+ for (zap_cursor_init(&zc, dp->dp_meta_objset,
+ scn->scn_phys.scn_queue_obj);
+ zap_cursor_retrieve(&zc, &za) == 0;
+ (void) zap_cursor_advance(&zc)) {
+ scan_ds_queue_insert(scn,
+ zfs_strtonum(za.za_name, NULL),
+ za.za_first_integer);
+ }
+ zap_cursor_fini(&zc);
+ }
+
spa_scan_stat_init(spa);
return (0);
}
@@ -195,19 +520,116 @@
void
dsl_scan_fini(dsl_pool_t *dp)
{
- if (dp->dp_scan) {
+ if (dp->dp_scan != NULL) {
+ dsl_scan_t *scn = dp->dp_scan;
+
+ if (scn->scn_taskq != NULL)
+ taskq_destroy(scn->scn_taskq);
+ scan_ds_queue_clear(scn);
+ avl_destroy(&scn->scn_queue);
+ avl_destroy(&scn->scn_prefetch_queue);
+
kmem_free(dp->dp_scan, sizeof (dsl_scan_t));
dp->dp_scan = NULL;
}
}
+static boolean_t
+dsl_scan_restarting(dsl_scan_t *scn, dmu_tx_t *tx)
+{
+ return (scn->scn_restart_txg != 0 &&
+ scn->scn_restart_txg <= tx->tx_txg);
+}
+
+boolean_t
+dsl_scan_scrubbing(const dsl_pool_t *dp)
+{
+ dsl_scan_phys_t *scn_phys = &dp->dp_scan->scn_phys;
+
+ return (scn_phys->scn_state == DSS_SCANNING &&
+ scn_phys->scn_func == POOL_SCAN_SCRUB);
+}
+
+boolean_t
+dsl_scan_is_paused_scrub(const dsl_scan_t *scn)
+{
+ return (dsl_scan_scrubbing(scn->scn_dp) &&
+ scn->scn_phys.scn_flags & DSF_SCRUB_PAUSED);
+}
+
+/*
+ * Writes out a persistent dsl_scan_phys_t record to the pool directory.
+ * Because we can be running in the block sorting algorithm, we do not always
+ * want to write out the record, only when it is "safe" to do so. This safety
+ * condition is achieved by making sure that the sorting queues are empty
+ * (scn_bytes_pending == 0). When this condition is not true, the sync'd state
+ * is inconsistent with how much actual scanning progress has been made. The
+ * kind of sync to be performed is specified by the sync_type argument. If the
+ * sync is optional, we only sync if the queues are empty. If the sync is
+ * mandatory, we do a hard ASSERT to make sure that the queues are empty. The
+ * third possible state is a "cached" sync. This is done in response to:
+ * 1) The dataset that was in the last sync'd dsl_scan_phys_t having been
+ * destroyed, so we wouldn't be able to restart scanning from it.
+ * 2) The snapshot that was in the last sync'd dsl_scan_phys_t having been
+ * superseded by a newer snapshot.
+ * 3) The dataset that was in the last sync'd dsl_scan_phys_t having been
+ * swapped with its clone.
+ * In all cases, a cached sync simply rewrites the last record we've written,
+ * just slightly modified. For the modifications that are performed to the
+ * last written dsl_scan_phys_t, see dsl_scan_ds_destroyed,
+ * dsl_scan_ds_snapshotted and dsl_scan_ds_clone_swapped.
+ */
+static void
+dsl_scan_sync_state(dsl_scan_t *scn, dmu_tx_t *tx, state_sync_type_t sync_type)
+{
+ uint64_t i; /* unsigned, like the dsl_scan_should_clear() loop */
+ spa_t *spa = scn->scn_dp->dp_spa;
+
+ ASSERT(sync_type != SYNC_MANDATORY || scn->scn_bytes_pending == 0);
+ if (scn->scn_bytes_pending == 0) {
+ for (i = 0; i < spa->spa_root_vdev->vdev_children; i++) {
+ vdev_t *vd = spa->spa_root_vdev->vdev_child[i];
+ dsl_scan_io_queue_t *q = vd->vdev_scan_io_queue;
+
+ if (q == NULL)
+ continue;
+
+ mutex_enter(&vd->vdev_scan_io_queue_lock);
+ ASSERT3P(avl_first(&q->q_sios_by_addr), ==, NULL);
+ ASSERT3P(avl_first(&q->q_exts_by_size), ==, NULL);
+ ASSERT3P(range_tree_first(q->q_exts_by_addr), ==, NULL);
+ mutex_exit(&vd->vdev_scan_io_queue_lock);
+ }
+
+ if (scn->scn_phys.scn_queue_obj != 0)
+ scan_ds_queue_sync(scn, tx);
+ VERIFY0(zap_update(scn->scn_dp->dp_meta_objset,
+ DMU_POOL_DIRECTORY_OBJECT,
+ DMU_POOL_SCAN, sizeof (uint64_t), SCAN_PHYS_NUMINTS,
+ &scn->scn_phys, tx));
+ bcopy(&scn->scn_phys, &scn->scn_phys_cached,
+ sizeof (scn->scn_phys));
+
+ if (scn->scn_checkpointing)
+ zfs_dbgmsg("finish scan checkpoint");
+
+ scn->scn_checkpointing = B_FALSE;
+ scn->scn_last_checkpoint = ddi_get_lbolt();
+ } else if (sync_type == SYNC_CACHED) {
+ VERIFY0(zap_update(scn->scn_dp->dp_meta_objset,
+ DMU_POOL_DIRECTORY_OBJECT,
+ DMU_POOL_SCAN, sizeof (uint64_t), SCAN_PHYS_NUMINTS,
+ &scn->scn_phys_cached, tx));
+ }
+}
+
/* ARGSUSED */
static int
dsl_scan_setup_check(void *arg, dmu_tx_t *tx)
{
dsl_scan_t *scn = dmu_tx_pool(tx)->dp_scan;
- if (scn->scn_phys.scn_state == DSS_SCANNING)
+ if (dsl_scan_is_running(scn))
return (SET_ERROR(EBUSY));
return (0);
@@ -222,7 +644,7 @@
dsl_pool_t *dp = scn->scn_dp;
spa_t *spa = dp->dp_spa;
- ASSERT(scn->scn_phys.scn_state != DSS_SCANNING);
+ ASSERT(!dsl_scan_is_running(scn));
ASSERT(*funcp > POOL_SCAN_NONE && *funcp < POOL_SCAN_FUNCS);
bzero(&scn->scn_phys, sizeof (scn->scn_phys));
scn->scn_phys.scn_func = *funcp;
@@ -233,8 +655,11 @@
scn->scn_phys.scn_start_time = gethrestime_sec();
scn->scn_phys.scn_errors = 0;
scn->scn_phys.scn_to_examine = spa->spa_root_vdev->vdev_stat.vs_alloc;
+ scn->scn_issued_before_pass = 0;
scn->scn_restart_txg = 0;
scn->scn_done_txg = 0;
+ scn->scn_last_checkpoint = 0;
+ scn->scn_checkpointing = B_FALSE;
spa_scan_stat_init(spa);
if (DSL_SCAN_IS_SCRUB_RESILVER(scn)) {
@@ -267,8 +692,10 @@
if (dp->dp_blkstats == NULL) {
dp->dp_blkstats =
kmem_alloc(sizeof (zfs_all_blkstats_t), KM_SLEEP);
+ mutex_init(&dp->dp_blkstats->zab_lock, NULL,
+ MUTEX_DEFAULT, NULL);
}
- bzero(dp->dp_blkstats, sizeof (zfs_all_blkstats_t));
+ bzero(&dp->dp_blkstats->zab_type, sizeof (dp->dp_blkstats->zab_type));
if (spa_version(spa) < SPA_VERSION_DSL_SCRUB)
ot = DMU_OT_ZAP_OTHER;
@@ -276,13 +703,52 @@
scn->scn_phys.scn_queue_obj = zap_create(dp->dp_meta_objset,
ot ? ot : DMU_OT_SCAN_QUEUE, DMU_OT_NONE, 0, tx);
- dsl_scan_sync_state(scn, tx);
+ bcopy(&scn->scn_phys, &scn->scn_phys_cached, sizeof (scn->scn_phys));
+ dsl_scan_sync_state(scn, tx, SYNC_MANDATORY);
+
spa_history_log_internal(spa, "scan setup", tx,
"func=%u mintxg=%llu maxtxg=%llu",
*funcp, scn->scn_phys.scn_min_txg, scn->scn_phys.scn_max_txg);
}
+/*
+ * Called by the ZFS_IOC_POOL_SCAN ioctl to start a scrub or resilver.
+ * Can also be called to resume a paused scrub.
+ */
+int
+dsl_scan(dsl_pool_t *dp, pool_scan_func_t func)
+{
+ spa_t *spa = dp->dp_spa;
+ dsl_scan_t *scn = dp->dp_scan;
+
+ /*
+ * Purge all vdev caches and probe all devices. We do this here
+ * rather than in sync context because this requires a writer lock
+ * on the spa_config lock, which we can't do from sync context. The
+ * spa_scrub_reopen flag indicates that vdev_open() should not
+ * attempt to start another scrub.
+ */
+ spa_vdev_state_enter(spa, SCL_NONE);
+ spa->spa_scrub_reopen = B_TRUE;
+ vdev_reopen(spa->spa_root_vdev);
+ spa->spa_scrub_reopen = B_FALSE;
+ (void) spa_vdev_state_exit(spa, NULL, 0);
+
+ if (func == POOL_SCAN_SCRUB && dsl_scan_is_paused_scrub(scn)) {
+ /* got scrub start cmd, resume paused scrub */
+ int err = dsl_scrub_set_pause_resume(scn->scn_dp,
+ POOL_SCRUB_NORMAL);
+ if (err == 0)
+ return (ECANCELED);
+
+ return (SET_ERROR(err));
+ }
+
+ return (dsl_sync_task(spa_name(spa), dsl_scan_setup_check,
+ dsl_scan_setup_sync, &func, 0, ZFS_SPACE_CHECK_NONE));
+}
+
/* ARGSUSED */
static void
dsl_scan_done(dsl_scan_t *scn, boolean_t complete, dmu_tx_t *tx)
@@ -310,10 +776,11 @@
}
if (scn->scn_phys.scn_queue_obj != 0) {
- VERIFY(0 == dmu_object_free(dp->dp_meta_objset,
+ VERIFY0(dmu_object_free(dp->dp_meta_objset,
scn->scn_phys.scn_queue_obj, tx));
scn->scn_phys.scn_queue_obj = 0;
}
+ scan_ds_queue_clear(scn);
scn->scn_phys.scn_flags &= ~DSF_SCRUB_PAUSED;
@@ -321,14 +788,23 @@
* If we were "restarted" from a stopped state, don't bother
* with anything else.
*/
- if (scn->scn_phys.scn_state != DSS_SCANNING)
+ if (!dsl_scan_is_running(scn)) {
+ ASSERT(!scn->scn_is_sorted);
return;
+ }
- if (complete)
- scn->scn_phys.scn_state = DSS_FINISHED;
- else
- scn->scn_phys.scn_state = DSS_CANCELED;
+ if (scn->scn_is_sorted) {
+ scan_io_queues_destroy(scn);
+ scn->scn_is_sorted = B_FALSE;
+ if (scn->scn_taskq != NULL) {
+ taskq_destroy(scn->scn_taskq);
+ scn->scn_taskq = NULL;
+ }
+ }
+
+ scn->scn_phys.scn_state = complete ? DSS_FINISHED : DSS_CANCELED;
+
if (dsl_scan_restarting(scn, tx))
spa_history_log_internal(spa, "scan aborted, restarting", tx,
"errors=%llu", spa_get_errlog_size(spa));
@@ -340,12 +816,6 @@
"errors=%llu", spa_get_errlog_size(spa));
if (DSL_SCAN_IS_SCRUB_RESILVER(scn)) {
- mutex_enter(&spa->spa_scrub_lock);
- while (spa->spa_scrub_inflight > 0) {
- cv_wait(&spa->spa_scrub_io_cv,
- &spa->spa_scrub_lock);
- }
- mutex_exit(&spa->spa_scrub_lock);
spa->spa_scrub_started = B_FALSE;
spa->spa_scrub_active = B_FALSE;
@@ -381,6 +851,8 @@
}
scn->scn_phys.scn_end_time = gethrestime_sec();
+
+ ASSERT(!dsl_scan_is_running(scn));
}
/* ARGSUSED */
@@ -389,7 +861,7 @@
{
dsl_scan_t *scn = dmu_tx_pool(tx)->dp_scan;
- if (scn->scn_phys.scn_state != DSS_SCANNING)
+ if (!dsl_scan_is_running(scn))
return (SET_ERROR(ENOENT));
return (0);
}
@@ -401,7 +873,7 @@
dsl_scan_t *scn = dmu_tx_pool(tx)->dp_scan;
dsl_scan_done(scn, B_FALSE, tx);
- dsl_scan_sync_state(scn, tx);
+ dsl_scan_sync_state(scn, tx, SYNC_MANDATORY);
spa_event_notify(scn->scn_dp->dp_spa, NULL, NULL, ESC_ZFS_SCRUB_ABORT);
}
@@ -412,16 +884,6 @@
dsl_scan_cancel_sync, NULL, 3, ZFS_SPACE_CHECK_RESERVED));
}
-boolean_t
-dsl_scan_is_paused_scrub(const dsl_scan_t *scn)
-{
- if (dsl_scan_scrubbing(scn->scn_dp) &&
- scn->scn_phys.scn_flags & DSF_SCRUB_PAUSED)
- return (B_TRUE);
-
- return (B_FALSE);
-}
-
static int
dsl_scrub_pause_resume_check(void *arg, dmu_tx_t *tx)
{
@@ -456,7 +918,7 @@
/* can't pause a scrub when there is no in-progress scrub */
spa->spa_scan_pass_scrub_pause = gethrestime_sec();
scn->scn_phys.scn_flags |= DSF_SCRUB_PAUSED;
- dsl_scan_sync_state(scn, tx);
+ dsl_scan_sync_state(scn, tx, SYNC_CACHED);
spa_event_notify(spa, NULL, NULL, ESC_ZFS_SCRUB_PAUSED);
} else {
ASSERT3U(*cmd, ==, POOL_SCRUB_NORMAL);
@@ -470,7 +932,7 @@
gethrestime_sec() - spa->spa_scan_pass_scrub_pause;
spa->spa_scan_pass_scrub_pause = 0;
scn->scn_phys.scn_flags &= ~DSF_SCRUB_PAUSED;
- dsl_scan_sync_state(scn, tx);
+ dsl_scan_sync_state(scn, tx, SYNC_CACHED);
}
}
}
@@ -486,25 +948,25 @@
ZFS_SPACE_CHECK_RESERVED));
}
-boolean_t
-dsl_scan_scrubbing(const dsl_pool_t *dp)
+
+/* Start a new scan, or restart an existing one; txg == 0 assigns a new txg. */
+void
+dsl_resilver_restart(dsl_pool_t *dp, uint64_t txg)
{
- dsl_scan_t *scn = dp->dp_scan;
+ if (txg == 0) {
+ dmu_tx_t *tx;
+ tx = dmu_tx_create_dd(dp->dp_mos_dir);
+ VERIFY0(dmu_tx_assign(tx, TXG_WAIT));
- if (scn->scn_phys.scn_state == DSS_SCANNING &&
- scn->scn_phys.scn_func == POOL_SCAN_SCRUB)
- return (B_TRUE);
-
- return (B_FALSE);
+ txg = dmu_tx_get_txg(tx);
+ dp->dp_scan->scn_restart_txg = txg;
+ dmu_tx_commit(tx);
+ } else {
+ dp->dp_scan->scn_restart_txg = txg;
+ }
+ zfs_dbgmsg("restarting resilver txg=%llu", (u_longlong_t)txg);
}
-static void dsl_scan_visitbp(blkptr_t *bp, const zbookmark_phys_t *zb,
- dnode_phys_t *dnp, dsl_dataset_t *ds, dsl_scan_t *scn,
- dmu_objset_type_t ostype, dmu_tx_t *tx);
-static void dsl_scan_visitdnode(dsl_scan_t *, dsl_dataset_t *ds,
- dmu_objset_type_t ostype,
- dnode_phys_t *dnp, uint64_t object, dmu_tx_t *tx);
-
void
dsl_free(dsl_pool_t *dp, uint64_t txg, const blkptr_t *bp)
{
@@ -519,27 +981,171 @@
pio->io_flags));
}
-static uint64_t
-dsl_scan_ds_maxtxg(dsl_dataset_t *ds)
+static int
+scan_ds_queue_compare(const void *a, const void *b)
{
- uint64_t smt = ds->ds_dir->dd_pool->dp_scan->scn_phys.scn_max_txg;
- if (ds->ds_is_snapshot)
- return (MIN(smt, dsl_dataset_phys(ds)->ds_creation_txg));
- return (smt);
+ const scan_ds_t *sds_a = a, *sds_b = b;
+
+ if (sds_a->sds_dsobj < sds_b->sds_dsobj)
+ return (-1);
+ if (sds_a->sds_dsobj == sds_b->sds_dsobj)
+ return (0);
+ return (1);
}
static void
-dsl_scan_sync_state(dsl_scan_t *scn, dmu_tx_t *tx)
+scan_ds_queue_clear(dsl_scan_t *scn)
{
- VERIFY0(zap_update(scn->scn_dp->dp_meta_objset,
- DMU_POOL_DIRECTORY_OBJECT,
- DMU_POOL_SCAN, sizeof (uint64_t), SCAN_PHYS_NUMINTS,
- &scn->scn_phys, tx));
+ void *cookie = NULL;
+ scan_ds_t *sds;
+ while ((sds = avl_destroy_nodes(&scn->scn_queue, &cookie)) != NULL) {
+ kmem_free(sds, sizeof (*sds));
+ }
}
-extern int zfs_vdev_async_write_active_min_dirty_percent;
+static boolean_t
+scan_ds_queue_contains(dsl_scan_t *scn, uint64_t dsobj, uint64_t *txg)
+{
+ scan_ds_t srch, *sds;
+ srch.sds_dsobj = dsobj;
+ sds = avl_find(&scn->scn_queue, &srch, NULL);
+ if (sds != NULL && txg != NULL)
+ *txg = sds->sds_txg;
+ return (sds != NULL);
+}
+
+static void
+scan_ds_queue_insert(dsl_scan_t *scn, uint64_t dsobj, uint64_t txg)
+{
+ scan_ds_t *sds;
+ avl_index_t where;
+
+ sds = kmem_zalloc(sizeof (*sds), KM_SLEEP);
+ sds->sds_dsobj = dsobj;
+ sds->sds_txg = txg;
+
+ VERIFY3P(avl_find(&scn->scn_queue, sds, &where), ==, NULL);
+ avl_insert(&scn->scn_queue, sds, where);
+}
+
+static void
+scan_ds_queue_remove(dsl_scan_t *scn, uint64_t dsobj)
+{
+ scan_ds_t srch, *sds;
+
+ srch.sds_dsobj = dsobj;
+
+ sds = avl_find(&scn->scn_queue, &srch, NULL);
+ VERIFY(sds != NULL);
+ avl_remove(&scn->scn_queue, sds);
+ kmem_free(sds, sizeof (*sds));
+}
+
+static void
+scan_ds_queue_sync(dsl_scan_t *scn, dmu_tx_t *tx)
+{
+ dsl_pool_t *dp = scn->scn_dp;
+ spa_t *spa = dp->dp_spa;
+ dmu_object_type_t ot = (spa_version(spa) >= SPA_VERSION_DSL_SCRUB) ?
+ DMU_OT_SCAN_QUEUE : DMU_OT_ZAP_OTHER;
+
+ ASSERT0(scn->scn_bytes_pending);
+ ASSERT(scn->scn_phys.scn_queue_obj != 0);
+
+ VERIFY0(dmu_object_free(dp->dp_meta_objset,
+ scn->scn_phys.scn_queue_obj, tx));
+ scn->scn_phys.scn_queue_obj = zap_create(dp->dp_meta_objset, ot,
+ DMU_OT_NONE, 0, tx);
+ for (scan_ds_t *sds = avl_first(&scn->scn_queue);
+ sds != NULL; sds = AVL_NEXT(&scn->scn_queue, sds)) {
+ VERIFY0(zap_add_int_key(dp->dp_meta_objset,
+ scn->scn_phys.scn_queue_obj, sds->sds_dsobj,
+ sds->sds_txg, tx));
+ }
+}
+
+/*
+ * Computes the memory limit state that we're currently in. A sorted scan
+ * needs quite a bit of memory to hold the sorting queue, so we need to
+ * reasonably constrain the size so it doesn't impact overall system
+ * performance. We compute two limits:
+ * 1) Hard memory limit: if the amount of memory used by the sorting
+ * queues on a pool gets above this value, we stop the metadata
+ * scanning portion and start issuing the queued up and sorted
+ * I/Os to reduce memory usage.
+ * This limit is calculated as a fraction of physmem (by default 5%).
+ * We constrain the lower bound of the hard limit to an absolute
+ * minimum of zfs_scan_mem_lim_min (default: 16 MiB). We also constrain
+ * the upper bound to 5% of the total pool size - no chance we'll
+ * ever need that much memory, but just to keep the value in check.
+ * 2) Soft memory limit: once we hit the hard memory limit, we start
+ * issuing I/O to reduce queue memory usage, but we don't want to
+ * completely empty out the queues, since we might be able to find I/Os
+ * that will fill in the gaps of our non-sequential IOs at some point
+ * in the future. So we stop the issuing of I/Os once the amount of
+ * memory used drops below the soft limit (at which point we stop issuing
+ * I/O and start scanning metadata again).
+ *
+ * This limit is calculated by subtracting a fraction of the hard
+ * limit from the hard limit. By default this fraction is 5%, so
+ * the soft limit is 95% of the hard limit. We cap the size of the
+ * difference between the hard and soft limits at an absolute
+ * maximum of zfs_scan_mem_lim_soft_max (default: 128 MiB) - this is
+ * sufficient to not cause too frequent switching between the
+ * metadata scan and I/O issue (even at 2k recordsize, 128 MiB's
+ * worth of queues is about 1.2 GiB of on-pool data, so scanning
+ * that should take at least a decent fraction of a second).
+ */
static boolean_t
+dsl_scan_should_clear(dsl_scan_t *scn)
+{
+ vdev_t *rvd = scn->scn_dp->dp_spa->spa_root_vdev;
+ uint64_t mlim_hard, mlim_soft, mused;
+ uint64_t alloc = metaslab_class_get_alloc(spa_normal_class(
+ scn->scn_dp->dp_spa));
+
+ mlim_hard = MAX((physmem / zfs_scan_mem_lim_fact) * PAGESIZE,
+ zfs_scan_mem_lim_min);
+ mlim_hard = MIN(mlim_hard, alloc / 20);
+ mlim_soft = mlim_hard - MIN(mlim_hard / zfs_scan_mem_lim_soft_fact,
+ zfs_scan_mem_lim_soft_max);
+ mused = 0;
+ for (uint64_t i = 0; i < rvd->vdev_children; i++) {
+ vdev_t *tvd = rvd->vdev_child[i];
+ dsl_scan_io_queue_t *queue;
+
+ mutex_enter(&tvd->vdev_scan_io_queue_lock);
+ queue = tvd->vdev_scan_io_queue;
+ if (queue != NULL) {
+ /* #extents in exts_by_size = # in exts_by_addr */
+ mused += avl_numnodes(&queue->q_exts_by_size) *
+ sizeof (range_seg_t) +
+ avl_numnodes(&queue->q_sios_by_addr) *
+ sizeof (scan_io_t);
+ }
+ mutex_exit(&tvd->vdev_scan_io_queue_lock);
+ }
+
+ dprintf("current scan memory usage: %llu bytes\n", (longlong_t)mused);
+
+ if (mused == 0)
+ ASSERT0(scn->scn_bytes_pending);
+
+ /*
+ * If we are above our hard limit, we need to clear out memory.
+ * If we are below our soft limit, we need to accumulate sequential IOs.
+ * Otherwise, we should keep doing whatever we are currently doing.
+ */
+ if (mused >= mlim_hard)
+ return (B_TRUE);
+ else if (mused < mlim_soft)
+ return (B_FALSE);
+ else
+ return (scn->scn_clearing);
+}
+
+static boolean_t
dsl_scan_check_suspend(dsl_scan_t *scn, const zbookmark_phys_t *zb)
{
/* we never skip user/group accounting objects */
@@ -558,9 +1164,6 @@
/*
* We suspend if:
- * - we have scanned for the maximum time: an entire txg
- * timeout (default 5 sec)
- * or
* - we have scanned for at least the minimum time (default 1 sec
* for scrub, 3 sec for resilver), and either we have sufficient
* dirty data that we are starting to write more quickly
@@ -569,16 +1172,24 @@
* or
* - the spa is shutting down because this pool is being exported
* or the machine is rebooting.
+ * or
+ * - the scan queue has reached its memory use limit
*/
- int mintime = (scn->scn_phys.scn_func == POOL_SCAN_RESILVER) ?
- zfs_resilver_min_time_ms : zfs_scan_min_time_ms;
- uint64_t elapsed_nanosecs = gethrtime() - scn->scn_sync_start_time;
+ uint64_t curr_time_ns = gethrtime();
+ uint64_t scan_time_ns = curr_time_ns - scn->scn_sync_start_time;
+ uint64_t sync_time_ns = curr_time_ns -
+ scn->scn_dp->dp_spa->spa_sync_starttime;
+
int dirty_pct = scn->scn_dp->dp_dirty_total * 100 / zfs_dirty_data_max;
- if (elapsed_nanosecs / NANOSEC >= zfs_txg_timeout ||
- (NSEC2MSEC(elapsed_nanosecs) > mintime &&
- (txg_sync_waiting(scn->scn_dp) ||
- dirty_pct >= zfs_vdev_async_write_active_min_dirty_percent)) ||
- spa_shutting_down(scn->scn_dp->dp_spa)) {
+ int mintime = (scn->scn_phys.scn_func == POOL_SCAN_RESILVER) ?
+ zfs_resilver_min_time_ms : zfs_scrub_min_time_ms;
+
+ if ((NSEC2MSEC(scan_time_ns) > mintime &&
+ (dirty_pct >= zfs_vdev_async_write_active_min_dirty_percent ||
+ txg_sync_waiting(scn->scn_dp) ||
+ NSEC2SEC(sync_time_ns) >= zfs_txg_timeout)) ||
+ spa_shutting_down(scn->scn_dp->dp_spa) ||
+ (zfs_scan_strict_mem_lim && dsl_scan_should_clear(scn))) {
if (zb) {
dprintf("suspending at bookmark %llx/%llx/%llx/%llx\n",
(longlong_t)zb->zb_objset,
@@ -586,12 +1198,16 @@
(longlong_t)zb->zb_level,
(longlong_t)zb->zb_blkid);
scn->scn_phys.scn_bookmark = *zb;
+ } else {
+ dsl_scan_phys_t *scnp = &scn->scn_phys;
+
+ dprintf("suspending at DDT bookmark "
+ "%llx/%llx/%llx/%llx\n",
+ (longlong_t)scnp->scn_ddt_bookmark.ddb_class,
+ (longlong_t)scnp->scn_ddt_bookmark.ddb_type,
+ (longlong_t)scnp->scn_ddt_bookmark.ddb_checksum,
+ (longlong_t)scnp->scn_ddt_bookmark.ddb_cursor);
}
- dprintf("suspending at DDT bookmark %llx/%llx/%llx/%llx\n",
- (longlong_t)scn->scn_phys.scn_ddt_bookmark.ddb_class,
- (longlong_t)scn->scn_phys.scn_ddt_bookmark.ddb_type,
- (longlong_t)scn->scn_phys.scn_ddt_bookmark.ddb_checksum,
- (longlong_t)scn->scn_phys.scn_ddt_bookmark.ddb_cursor);
scn->scn_suspending = B_TRUE;
return (B_TRUE);
}
@@ -690,28 +1306,278 @@
zil_free(zilog);
}
-/* ARGSUSED */
+/*
+ * We compare scan_prefetch_issue_ctx_t's based on their bookmarks. The idea
+ * here is to sort the AVL tree by the order each block will be needed.
+ */
+static int
+scan_prefetch_queue_compare(const void *a, const void *b)
+{
+ const scan_prefetch_issue_ctx_t *spic_a = a, *spic_b = b;
+ const scan_prefetch_ctx_t *spc_a = spic_a->spic_spc;
+ const scan_prefetch_ctx_t *spc_b = spic_b->spic_spc;
+
+ return (zbookmark_compare(spc_a->spc_datablkszsec,
+ spc_a->spc_indblkshift, spc_b->spc_datablkszsec,
+ spc_b->spc_indblkshift, &spic_a->spic_zb, &spic_b->spic_zb));
+}
+
static void
-dsl_scan_prefetch(dsl_scan_t *scn, arc_buf_t *buf, blkptr_t *bp,
- uint64_t objset, uint64_t object, uint64_t blkid)
+scan_prefetch_ctx_rele(scan_prefetch_ctx_t *spc, void *tag)
{
- zbookmark_phys_t czb;
- arc_flags_t flags = ARC_FLAG_NOWAIT | ARC_FLAG_PREFETCH;
+ if (refcount_remove(&spc->spc_refcnt, tag) == 0) {
+ refcount_destroy(&spc->spc_refcnt);
+ kmem_free(spc, sizeof (scan_prefetch_ctx_t));
+ }
+}
+static scan_prefetch_ctx_t *
+scan_prefetch_ctx_create(dsl_scan_t *scn, dnode_phys_t *dnp, void *tag)
+{
+ scan_prefetch_ctx_t *spc;
+
+ spc = kmem_alloc(sizeof (scan_prefetch_ctx_t), KM_SLEEP);
+ refcount_create(&spc->spc_refcnt);
+ refcount_add(&spc->spc_refcnt, tag);
+ spc->spc_scn = scn;
+ if (dnp != NULL) {
+ spc->spc_datablkszsec = dnp->dn_datablkszsec;
+ spc->spc_indblkshift = dnp->dn_indblkshift;
+ spc->spc_root = B_FALSE;
+ } else {
+ spc->spc_datablkszsec = 0;
+ spc->spc_indblkshift = 0;
+ spc->spc_root = B_TRUE;
+ }
+
+ return (spc);
+}
+
+static void
+scan_prefetch_ctx_add_ref(scan_prefetch_ctx_t *spc, void *tag)
+{
+ refcount_add(&spc->spc_refcnt, tag);
+}
+
+static boolean_t
+dsl_scan_check_prefetch_resume(scan_prefetch_ctx_t *spc,
+ const zbookmark_phys_t *zb)
+{
+ zbookmark_phys_t *last_zb = &spc->spc_scn->scn_prefetch_bookmark;
+ dnode_phys_t tmp_dnp;
+ dnode_phys_t *dnp = (spc->spc_root) ? NULL : &tmp_dnp;
+
+ if (zb->zb_objset != last_zb->zb_objset)
+ return (B_TRUE);
+ if ((int64_t)zb->zb_object < 0)
+ return (B_FALSE);
+
+ tmp_dnp.dn_datablkszsec = spc->spc_datablkszsec;
+ tmp_dnp.dn_indblkshift = spc->spc_indblkshift;
+
+ if (zbookmark_subtree_completed(dnp, zb, last_zb))
+ return (B_TRUE);
+
+ return (B_FALSE);
+}
+
+static void
+dsl_scan_prefetch(scan_prefetch_ctx_t *spc, blkptr_t *bp, zbookmark_phys_t *zb)
+{
+ avl_index_t idx;
+ dsl_scan_t *scn = spc->spc_scn;
+ spa_t *spa = scn->scn_dp->dp_spa;
+ scan_prefetch_issue_ctx_t *spic;
+
if (zfs_no_scrub_prefetch)
return;
- if (BP_IS_HOLE(bp) || bp->blk_birth <= scn->scn_phys.scn_min_txg ||
- (BP_GET_LEVEL(bp) == 0 && BP_GET_TYPE(bp) != DMU_OT_DNODE))
+ if (BP_IS_HOLE(bp) || bp->blk_birth <= scn->scn_phys.scn_cur_min_txg ||
+ (BP_GET_LEVEL(bp) == 0 && BP_GET_TYPE(bp) != DMU_OT_DNODE &&
+ BP_GET_TYPE(bp) != DMU_OT_OBJSET))
return;
- SET_BOOKMARK(&czb, objset, object, BP_GET_LEVEL(bp), blkid);
+ if (dsl_scan_check_prefetch_resume(spc, zb))
+ return;
- (void) arc_read(scn->scn_zio_root, scn->scn_dp->dp_spa, bp,
- NULL, NULL, ZIO_PRIORITY_ASYNC_READ,
- ZIO_FLAG_CANFAIL | ZIO_FLAG_SCAN_THREAD, &flags, &czb);
+ scan_prefetch_ctx_add_ref(spc, scn);
+ spic = kmem_alloc(sizeof (scan_prefetch_issue_ctx_t), KM_SLEEP);
+ spic->spic_spc = spc;
+ spic->spic_bp = *bp;
+ spic->spic_zb = *zb;
+
+ /*
+ * Add the IO to the queue of blocks to prefetch. This allows us to
+ * prioritize blocks that we will need first for the main traversal
+ * thread.
+ */
+ mutex_enter(&spa->spa_scrub_lock);
+ if (avl_find(&scn->scn_prefetch_queue, spic, &idx) != NULL) {
+ /* this block is already queued for prefetch */
+ kmem_free(spic, sizeof (scan_prefetch_issue_ctx_t));
+ scan_prefetch_ctx_rele(spc, scn);
+ mutex_exit(&spa->spa_scrub_lock);
+ return;
+ }
+
+ avl_insert(&scn->scn_prefetch_queue, spic, idx);
+ cv_broadcast(&spa->spa_scrub_io_cv);
+ mutex_exit(&spa->spa_scrub_lock);
}
+static void
+dsl_scan_prefetch_dnode(dsl_scan_t *scn, dnode_phys_t *dnp,
+ uint64_t objset, uint64_t object)
+{
+ int i;
+ zbookmark_phys_t zb;
+ scan_prefetch_ctx_t *spc;
+
+ if (dnp->dn_nblkptr == 0 && !(dnp->dn_flags & DNODE_FLAG_SPILL_BLKPTR))
+ return;
+
+ SET_BOOKMARK(&zb, objset, object, 0, 0);
+
+ spc = scan_prefetch_ctx_create(scn, dnp, FTAG);
+
+ for (i = 0; i < dnp->dn_nblkptr; i++) {
+ zb.zb_level = BP_GET_LEVEL(&dnp->dn_blkptr[i]);
+ zb.zb_blkid = i;
+ dsl_scan_prefetch(spc, &dnp->dn_blkptr[i], &zb);
+ }
+
+ if (dnp->dn_flags & DNODE_FLAG_SPILL_BLKPTR) {
+ zb.zb_level = 0;
+ zb.zb_blkid = DMU_SPILL_BLKID;
+ dsl_scan_prefetch(spc, &dnp->dn_spill, &zb);
+ }
+
+ scan_prefetch_ctx_rele(spc, FTAG);
+}
+
+void
+dsl_scan_prefetch_cb(zio_t *zio, const zbookmark_phys_t *zb, const blkptr_t *bp,
+ arc_buf_t *buf, void *private)
+{
+ scan_prefetch_ctx_t *spc = private;
+ dsl_scan_t *scn = spc->spc_scn;
+ spa_t *spa = scn->scn_dp->dp_spa;
+
+ /* broadcast that the IO has completed for rate limiting purposes */
+ mutex_enter(&spa->spa_scrub_lock);
+ ASSERT3U(spa->spa_scrub_inflight, >=, BP_GET_PSIZE(bp));
+ spa->spa_scrub_inflight -= BP_GET_PSIZE(bp);
+ cv_broadcast(&spa->spa_scrub_io_cv);
+ mutex_exit(&spa->spa_scrub_lock);
+
+ /* if there was an error or we are done prefetching, just cleanup */
+ if (buf == NULL || scn->scn_suspending)
+ goto out;
+
+ if (BP_GET_LEVEL(bp) > 0) {
+ int i;
+ blkptr_t *cbp;
+ int epb = BP_GET_LSIZE(bp) >> SPA_BLKPTRSHIFT;
+ zbookmark_phys_t czb;
+
+ for (i = 0, cbp = buf->b_data; i < epb; i++, cbp++) {
+ SET_BOOKMARK(&czb, zb->zb_objset, zb->zb_object,
+ zb->zb_level - 1, zb->zb_blkid * epb + i);
+ dsl_scan_prefetch(spc, cbp, &czb);
+ }
+ } else if (BP_GET_TYPE(bp) == DMU_OT_DNODE) {
+ dnode_phys_t *cdnp = buf->b_data;
+ int i;
+ int epb = BP_GET_LSIZE(bp) >> DNODE_SHIFT;
+
+ for (i = 0, cdnp = buf->b_data; i < epb; i++, cdnp++) {
+ dsl_scan_prefetch_dnode(scn, cdnp,
+ zb->zb_objset, zb->zb_blkid * epb + i);
+ }
+ } else if (BP_GET_TYPE(bp) == DMU_OT_OBJSET) {
+ objset_phys_t *osp = buf->b_data;
+
+ dsl_scan_prefetch_dnode(scn, &osp->os_meta_dnode,
+ zb->zb_objset, DMU_META_DNODE_OBJECT);
+
+ if (OBJSET_BUF_HAS_USERUSED(buf)) {
+ dsl_scan_prefetch_dnode(scn,
+ &osp->os_groupused_dnode, zb->zb_objset,
+ DMU_GROUPUSED_OBJECT);
+ dsl_scan_prefetch_dnode(scn,
+ &osp->os_userused_dnode, zb->zb_objset,
+ DMU_USERUSED_OBJECT);
+ }
+ }
+
+out:
+ if (buf != NULL)
+ arc_buf_destroy(buf, private);
+ scan_prefetch_ctx_rele(spc, scn);
+}
+
+/* ARGSUSED */
+static void
+dsl_scan_prefetch_thread(void *arg)
+{
+ dsl_scan_t *scn = arg;
+ spa_t *spa = scn->scn_dp->dp_spa;
+ vdev_t *rvd = spa->spa_root_vdev;
+ uint64_t maxinflight = rvd->vdev_children * zfs_top_maxinflight;
+ scan_prefetch_issue_ctx_t *spic;
+
+ /* loop until we are told to stop */
+ while (!scn->scn_prefetch_stop) {
+ arc_flags_t flags = ARC_FLAG_NOWAIT |
+ ARC_FLAG_PRESCIENT_PREFETCH | ARC_FLAG_PREFETCH;
+ int zio_flags = ZIO_FLAG_CANFAIL | ZIO_FLAG_SCAN_THREAD;
+
+ mutex_enter(&spa->spa_scrub_lock);
+
+ /*
+ * Wait until we have an IO to issue and are not above our
+ * maximum in flight limit.
+ */
+ while (!scn->scn_prefetch_stop &&
+ (avl_numnodes(&scn->scn_prefetch_queue) == 0 ||
+ spa->spa_scrub_inflight >= scn->scn_maxinflight_bytes)) {
+ cv_wait(&spa->spa_scrub_io_cv, &spa->spa_scrub_lock);
+ }
+
+ /* recheck if we should stop since we waited for the cv */
+ if (scn->scn_prefetch_stop) {
+ mutex_exit(&spa->spa_scrub_lock);
+ break;
+ }
+
+ /* remove the prefetch IO from the tree */
+ spic = avl_first(&scn->scn_prefetch_queue);
+ spa->spa_scrub_inflight += BP_GET_PSIZE(&spic->spic_bp);
+ avl_remove(&scn->scn_prefetch_queue, spic);
+
+ mutex_exit(&spa->spa_scrub_lock);
+
+ /* issue the prefetch asynchronously */
+ (void) arc_read(scn->scn_zio_root, scn->scn_dp->dp_spa,
+ &spic->spic_bp, dsl_scan_prefetch_cb, spic->spic_spc,
+ ZIO_PRIORITY_SCRUB, zio_flags, &flags, &spic->spic_zb);
+
+ kmem_free(spic, sizeof (scan_prefetch_issue_ctx_t));
+ }
+
+ ASSERT(scn->scn_prefetch_stop);
+
+ /* free any prefetches we didn't get to complete */
+ mutex_enter(&spa->spa_scrub_lock);
+ while ((spic = avl_first(&scn->scn_prefetch_queue)) != NULL) {
+ avl_remove(&scn->scn_prefetch_queue, spic);
+ scan_prefetch_ctx_rele(spic->spic_spc, scn);
+ kmem_free(spic, sizeof (scan_prefetch_issue_ctx_t));
+ }
+ ASSERT0(avl_numnodes(&scn->scn_prefetch_queue));
+ mutex_exit(&spa->spa_scrub_lock);
+}
+
static boolean_t
dsl_scan_check_resume(dsl_scan_t *scn, const dnode_phys_t *dnp,
const zbookmark_phys_t *zb)
@@ -748,6 +1614,13 @@
return (B_FALSE);
}
+static void dsl_scan_visitbp(blkptr_t *bp, const zbookmark_phys_t *zb,
+ dnode_phys_t *dnp, dsl_dataset_t *ds, dsl_scan_t *scn,
+ dmu_objset_type_t ostype, dmu_tx_t *tx);
+static void dsl_scan_visitdnode(
+ dsl_scan_t *, dsl_dataset_t *ds, dmu_objset_type_t ostype,
+ dnode_phys_t *dnp, uint64_t object, dmu_tx_t *tx);
+
/*
* Return nonzero on i/o error.
* Return new buf to write out in *bufp.
@@ -769,16 +1642,12 @@
arc_buf_t *buf;
err = arc_read(NULL, dp->dp_spa, bp, arc_getbuf_func, &buf,
- ZIO_PRIORITY_ASYNC_READ, zio_flags, &flags, zb);
+ ZIO_PRIORITY_SCRUB, zio_flags, &flags, zb);
if (err) {
scn->scn_phys.scn_errors++;
return (err);
}
for (i = 0, cbp = buf->b_data; i < epb; i++, cbp++) {
- dsl_scan_prefetch(scn, buf, cbp, zb->zb_objset,
- zb->zb_object, zb->zb_blkid * epb + i);
- }
- for (i = 0, cbp = buf->b_data; i < epb; i++, cbp++) {
zbookmark_phys_t czb;
SET_BOOKMARK(&czb, zb->zb_objset, zb->zb_object,
@@ -791,24 +1660,17 @@
} else if (BP_GET_TYPE(bp) == DMU_OT_DNODE) {
arc_flags_t flags = ARC_FLAG_WAIT;
dnode_phys_t *cdnp;
- int i, j;
+ int i;
int epb = BP_GET_LSIZE(bp) >> DNODE_SHIFT;
arc_buf_t *buf;
err = arc_read(NULL, dp->dp_spa, bp, arc_getbuf_func, &buf,
- ZIO_PRIORITY_ASYNC_READ, zio_flags, &flags, zb);
+ ZIO_PRIORITY_SCRUB, zio_flags, &flags, zb);
if (err) {
scn->scn_phys.scn_errors++;
return (err);
}
for (i = 0, cdnp = buf->b_data; i < epb; i++, cdnp++) {
- for (j = 0; j < cdnp->dn_nblkptr; j++) {
- blkptr_t *cbp = &cdnp->dn_blkptr[j];
- dsl_scan_prefetch(scn, buf, cbp,
- zb->zb_objset, zb->zb_blkid * epb + i, j);
- }
- }
- for (i = 0, cdnp = buf->b_data; i < epb; i++, cdnp++) {
dsl_scan_visitdnode(scn, ds, ostype,
cdnp, zb->zb_blkid * epb + i, tx);
}
@@ -820,7 +1682,7 @@
arc_buf_t *buf;
err = arc_read(NULL, dp->dp_spa, bp, arc_getbuf_func, &buf,
- ZIO_PRIORITY_ASYNC_READ, zio_flags, &flags, zb);
+ ZIO_PRIORITY_SCRUB, zio_flags, &flags, zb);
if (err) {
scn->scn_phys.scn_errors++;
return (err);
@@ -886,20 +1748,14 @@
dmu_objset_type_t ostype, dmu_tx_t *tx)
{
dsl_pool_t *dp = scn->scn_dp;
- arc_buf_t *buf = NULL;
- blkptr_t bp_toread = *bp;
+ blkptr_t *bp_toread = NULL;
- /* ASSERT(pbuf == NULL || arc_released(pbuf)); */
-
if (dsl_scan_check_suspend(scn, zb))
return;
if (dsl_scan_check_resume(scn, dnp, zb))
return;
- if (BP_IS_HOLE(bp))
- return;
-
scn->scn_visited_this_txg++;
dprintf_bp(bp,
@@ -908,12 +1764,22 @@
zb->zb_objset, zb->zb_object, zb->zb_level, zb->zb_blkid,
bp);
- if (bp->blk_birth <= scn->scn_phys.scn_cur_min_txg)
+ if (BP_IS_HOLE(bp)) {
+ scn->scn_holes_this_txg++;
return;
+ }
- if (dsl_scan_recurse(scn, ds, ostype, dnp, &bp_toread, zb, tx) != 0)
+ if (bp->blk_birth <= scn->scn_phys.scn_cur_min_txg) {
+ scn->scn_lt_min_this_txg++;
return;
+ }
+ bp_toread = kmem_alloc(sizeof (blkptr_t), KM_SLEEP);
+ *bp_toread = *bp;
+
+ if (dsl_scan_recurse(scn, ds, ostype, dnp, bp_toread, zb, tx) != 0)
+ goto out;
+
/*
* If dsl_scan_ddt() has already visited this block, it will have
* already done any translations or scrubbing, so don't call the
@@ -921,8 +1787,8 @@
*/
if (ddt_class_contains(dp->dp_spa,
scn->scn_phys.scn_ddt_class_max, bp)) {
- ASSERT(buf == NULL);
- return;
+ scn->scn_ddt_contained_this_txg++;
+ goto out;
}
/*
@@ -932,9 +1798,14 @@
* Don't scan it now unless we need to because something
* under it was modified.
*/
- if (BP_PHYSICAL_BIRTH(bp) <= scn->scn_phys.scn_cur_max_txg) {
- scan_funcs[scn->scn_phys.scn_func](dp, bp, zb);
+ if (BP_PHYSICAL_BIRTH(bp) > scn->scn_phys.scn_cur_max_txg) {
+ scn->scn_gt_max_this_txg++;
+ goto out;
}
+
+ scan_funcs[scn->scn_phys.scn_func](dp, bp, zb);
+out:
+ kmem_free(bp_toread, sizeof (blkptr_t));
}
static void
@@ -942,26 +1813,33 @@
dmu_tx_t *tx)
{
zbookmark_phys_t zb;
+ scan_prefetch_ctx_t *spc;
SET_BOOKMARK(&zb, ds ? ds->ds_object : DMU_META_OBJSET,
ZB_ROOT_OBJECT, ZB_ROOT_LEVEL, ZB_ROOT_BLKID);
- dsl_scan_visitbp(bp, &zb, NULL,
- ds, scn, DMU_OST_NONE, tx);
+ if (ZB_IS_ZERO(&scn->scn_phys.scn_bookmark)) {
+ SET_BOOKMARK(&scn->scn_prefetch_bookmark,
+ zb.zb_objset, 0, 0, 0);
+ } else {
+ scn->scn_prefetch_bookmark = scn->scn_phys.scn_bookmark;
+ }
+
+ scn->scn_objsets_visited_this_txg++;
+
+ spc = scan_prefetch_ctx_create(scn, NULL, FTAG);
+ dsl_scan_prefetch(spc, bp, &zb);
+ scan_prefetch_ctx_rele(spc, FTAG);
+
+ dsl_scan_visitbp(bp, &zb, NULL, ds, scn, DMU_OST_NONE, tx);
+
dprintf_ds(ds, "finished scan%s", "");
}
-void
-dsl_scan_ds_destroyed(dsl_dataset_t *ds, dmu_tx_t *tx)
+static void
+ds_destroyed_scn_phys(dsl_dataset_t *ds, dsl_scan_phys_t *scn_phys)
{
- dsl_pool_t *dp = ds->ds_dir->dd_pool;
- dsl_scan_t *scn = dp->dp_scan;
- uint64_t mintxg;
-
- if (scn->scn_phys.scn_state != DSS_SCANNING)
- return;
-
- if (scn->scn_phys.scn_bookmark.zb_objset == ds->ds_object) {
+ if (scn_phys->scn_bookmark.zb_objset == ds->ds_object) {
if (ds->ds_is_snapshot) {
/*
* Note:
@@ -973,23 +1851,57 @@
* ignore it when we retraverse it in
* dsl_scan_visitds().
*/
- scn->scn_phys.scn_bookmark.zb_objset =
+ scn_phys->scn_bookmark.zb_objset =
dsl_dataset_phys(ds)->ds_next_snap_obj;
zfs_dbgmsg("destroying ds %llu; currently traversing; "
"reset zb_objset to %llu",
(u_longlong_t)ds->ds_object,
(u_longlong_t)dsl_dataset_phys(ds)->
ds_next_snap_obj);
- scn->scn_phys.scn_flags |= DSF_VISIT_DS_AGAIN;
+ scn_phys->scn_flags |= DSF_VISIT_DS_AGAIN;
} else {
- SET_BOOKMARK(&scn->scn_phys.scn_bookmark,
+ SET_BOOKMARK(&scn_phys->scn_bookmark,
ZB_DESTROYED_OBJSET, 0, 0, 0);
zfs_dbgmsg("destroying ds %llu; currently traversing; "
"reset bookmark to -1,0,0,0",
(u_longlong_t)ds->ds_object);
}
- } else if (zap_lookup_int_key(dp->dp_meta_objset,
- scn->scn_phys.scn_queue_obj, ds->ds_object, &mintxg) == 0) {
+ }
+}
+
+/*
+ * Invoked when a dataset is destroyed. We need to make sure that:
+ *
+ * 1) If it is the dataset that was currently being scanned, we write
+ * a new dsl_scan_phys_t and mark the objset reference in it
+ * as destroyed.
+ * 2) Remove it from the work queue, if it was present.
+ *
+ * If the dataset was actually a snapshot, instead of marking the dataset
+ * as destroyed, we instead substitute the next snapshot in line.
+ */
+void
+dsl_scan_ds_destroyed(dsl_dataset_t *ds, dmu_tx_t *tx)
+{
+ dsl_pool_t *dp = ds->ds_dir->dd_pool;
+ dsl_scan_t *scn = dp->dp_scan;
+ uint64_t mintxg;
+
+ if (!dsl_scan_is_running(scn))
+ return;
+
+ ds_destroyed_scn_phys(ds, &scn->scn_phys);
+ ds_destroyed_scn_phys(ds, &scn->scn_phys_cached);
+
+ if (scan_ds_queue_contains(scn, ds->ds_object, &mintxg)) {
+ scan_ds_queue_remove(scn, ds->ds_object);
+ if (ds->ds_is_snapshot)
+ scan_ds_queue_insert(scn,
+ dsl_dataset_phys(ds)->ds_next_snap_obj, mintxg);
+ }
+
+ if (zap_lookup_int_key(dp->dp_meta_objset, scn->scn_phys.scn_queue_obj,
+ ds->ds_object, &mintxg) == 0) {
ASSERT3U(dsl_dataset_phys(ds)->ds_num_children, <=, 1);
VERIFY3U(0, ==, zap_remove_int(dp->dp_meta_objset,
scn->scn_phys.scn_queue_obj, ds->ds_object, tx));
@@ -1018,9 +1930,28 @@
* dsl_scan_sync() should be called after this, and should sync
* out our changed state, but just to be safe, do it here.
*/
- dsl_scan_sync_state(scn, tx);
+ dsl_scan_sync_state(scn, tx, SYNC_CACHED);
}
+static void
+ds_snapshotted_bookmark(dsl_dataset_t *ds, zbookmark_phys_t *scn_bookmark)
+{
+ if (scn_bookmark->zb_objset == ds->ds_object) {
+ scn_bookmark->zb_objset =
+ dsl_dataset_phys(ds)->ds_prev_snap_obj;
+ zfs_dbgmsg("snapshotting ds %llu; currently traversing; "
+ "reset zb_objset to %llu",
+ (u_longlong_t)ds->ds_object,
+ (u_longlong_t)dsl_dataset_phys(ds)->ds_prev_snap_obj);
+ }
+}
+
+/*
+ * Called when a dataset is snapshotted. If we were currently traversing
+ * this dataset, we reset our bookmark to point at the newly created
+ * snapshot. We also modify our work queue to remove the old snapshot and
+ * replace with the new one.
+ */
void
dsl_scan_ds_snapshotted(dsl_dataset_t *ds, dmu_tx_t *tx)
{
@@ -1028,20 +1959,22 @@
dsl_scan_t *scn = dp->dp_scan;
uint64_t mintxg;
- if (scn->scn_phys.scn_state != DSS_SCANNING)
+ if (!dsl_scan_is_running(scn))
return;
ASSERT(dsl_dataset_phys(ds)->ds_prev_snap_obj != 0);
- if (scn->scn_phys.scn_bookmark.zb_objset == ds->ds_object) {
- scn->scn_phys.scn_bookmark.zb_objset =
- dsl_dataset_phys(ds)->ds_prev_snap_obj;
- zfs_dbgmsg("snapshotting ds %llu; currently traversing; "
- "reset zb_objset to %llu",
- (u_longlong_t)ds->ds_object,
- (u_longlong_t)dsl_dataset_phys(ds)->ds_prev_snap_obj);
- } else if (zap_lookup_int_key(dp->dp_meta_objset,
- scn->scn_phys.scn_queue_obj, ds->ds_object, &mintxg) == 0) {
+ ds_snapshotted_bookmark(ds, &scn->scn_phys.scn_bookmark);
+ ds_snapshotted_bookmark(ds, &scn->scn_phys_cached.scn_bookmark);
+
+ if (scan_ds_queue_contains(scn, ds->ds_object, &mintxg)) {
+ scan_ds_queue_remove(scn, ds->ds_object);
+ scan_ds_queue_insert(scn,
+ dsl_dataset_phys(ds)->ds_prev_snap_obj, mintxg);
+ }
+
+ if (zap_lookup_int_key(dp->dp_meta_objset, scn->scn_phys.scn_queue_obj,
+ ds->ds_object, &mintxg) == 0) {
VERIFY3U(0, ==, zap_remove_int(dp->dp_meta_objset,
scn->scn_phys.scn_queue_obj, ds->ds_object, tx));
VERIFY(zap_add_int_key(dp->dp_meta_objset,
@@ -1052,37 +1985,59 @@
(u_longlong_t)ds->ds_object,
(u_longlong_t)dsl_dataset_phys(ds)->ds_prev_snap_obj);
}
- dsl_scan_sync_state(scn, tx);
+
+ dsl_scan_sync_state(scn, tx, SYNC_CACHED);
}
-void
-dsl_scan_ds_clone_swapped(dsl_dataset_t *ds1, dsl_dataset_t *ds2, dmu_tx_t *tx)
+static void
+ds_clone_swapped_bookmark(dsl_dataset_t *ds1, dsl_dataset_t *ds2,
+ zbookmark_phys_t *scn_bookmark)
{
- dsl_pool_t *dp = ds1->ds_dir->dd_pool;
- dsl_scan_t *scn = dp->dp_scan;
- uint64_t mintxg;
-
- if (scn->scn_phys.scn_state != DSS_SCANNING)
- return;
-
- if (scn->scn_phys.scn_bookmark.zb_objset == ds1->ds_object) {
- scn->scn_phys.scn_bookmark.zb_objset = ds2->ds_object;
+ if (scn_bookmark->zb_objset == ds1->ds_object) {
+ scn_bookmark->zb_objset = ds2->ds_object;
zfs_dbgmsg("clone_swap ds %llu; currently traversing; "
"reset zb_objset to %llu",
(u_longlong_t)ds1->ds_object,
(u_longlong_t)ds2->ds_object);
- } else if (scn->scn_phys.scn_bookmark.zb_objset == ds2->ds_object) {
- scn->scn_phys.scn_bookmark.zb_objset = ds1->ds_object;
+ } else if (scn_bookmark->zb_objset == ds2->ds_object) {
+ scn_bookmark->zb_objset = ds1->ds_object;
zfs_dbgmsg("clone_swap ds %llu; currently traversing; "
"reset zb_objset to %llu",
(u_longlong_t)ds2->ds_object,
(u_longlong_t)ds1->ds_object);
}
+}
+/*
+ * Called when a parent dataset and its clone are swapped. If we were
+ * currently traversing the dataset, we need to switch to traversing the
+ * newly promoted parent.
+ */
+void
+dsl_scan_ds_clone_swapped(dsl_dataset_t *ds1, dsl_dataset_t *ds2, dmu_tx_t *tx)
+{
+ dsl_pool_t *dp = ds1->ds_dir->dd_pool;
+ dsl_scan_t *scn = dp->dp_scan;
+ uint64_t mintxg;
+
+ if (!dsl_scan_is_running(scn))
+ return;
+
+ ds_clone_swapped_bookmark(ds1, ds2, &scn->scn_phys.scn_bookmark);
+ ds_clone_swapped_bookmark(ds1, ds2, &scn->scn_phys_cached.scn_bookmark);
+
+ uint64_t mintxg2;
+ boolean_t q1 = scan_ds_queue_contains(scn, ds1->ds_object, &mintxg);
+ boolean_t q2 = scan_ds_queue_contains(scn, ds2->ds_object, &mintxg2);
+ if (q1 != q2) { /* only one queued: move it to the other id */
+ scan_ds_queue_remove(scn, q1 ? ds1->ds_object : ds2->ds_object);
+ scan_ds_queue_insert(scn, q1 ? ds2->ds_object : ds1->ds_object,
+ q1 ? mintxg : mintxg2);
+ }
+
if (zap_lookup_int_key(dp->dp_meta_objset, scn->scn_phys.scn_queue_obj,
ds1->ds_object, &mintxg) == 0) {
int err;
-
ASSERT3U(mintxg, ==, dsl_dataset_phys(ds1)->ds_prev_snap_txg);
ASSERT3U(mintxg, ==, dsl_dataset_phys(ds2)->ds_prev_snap_txg);
VERIFY3U(0, ==, zap_remove_int(dp->dp_meta_objset,
@@ -1100,8 +2055,9 @@
"replacing with %llu",
(u_longlong_t)ds1->ds_object,
(u_longlong_t)ds2->ds_object);
- } else if (zap_lookup_int_key(dp->dp_meta_objset,
- scn->scn_phys.scn_queue_obj, ds2->ds_object, &mintxg) == 0) {
+ } else if (zap_lookup_int_key(dp->dp_meta_objset,
+ scn->scn_phys.scn_queue_obj, ds2->ds_object, &mintxg) == 0) {
+ /* "else": ds1 was not queued, so this cannot undo the swap above */
ASSERT3U(mintxg, ==, dsl_dataset_phys(ds1)->ds_prev_snap_txg);
ASSERT3U(mintxg, ==, dsl_dataset_phys(ds2)->ds_prev_snap_txg);
VERIFY3U(0, ==, zap_remove_int(dp->dp_meta_objset,
@@ -1114,31 +2070,26 @@
(u_longlong_t)ds1->ds_object);
}
- dsl_scan_sync_state(scn, tx);
+ dsl_scan_sync_state(scn, tx, SYNC_CACHED);
}
-struct enqueue_clones_arg {
- dmu_tx_t *tx;
- uint64_t originobj;
-};
-
/* ARGSUSED */
static int
enqueue_clones_cb(dsl_pool_t *dp, dsl_dataset_t *hds, void *arg)
{
- struct enqueue_clones_arg *eca = arg;
+ uint64_t originobj = *(uint64_t *)arg;
dsl_dataset_t *ds;
int err;
dsl_scan_t *scn = dp->dp_scan;
- if (dsl_dir_phys(hds->ds_dir)->dd_origin_obj != eca->originobj)
+ if (dsl_dir_phys(hds->ds_dir)->dd_origin_obj != originobj)
return (0);
err = dsl_dataset_hold_obj(dp, hds->ds_object, FTAG, &ds);
if (err)
return (err);
- while (dsl_dataset_phys(ds)->ds_prev_snap_obj != eca->originobj) {
+ while (dsl_dataset_phys(ds)->ds_prev_snap_obj != originobj) {
dsl_dataset_t *prev;
err = dsl_dataset_hold_obj(dp,
dsl_dataset_phys(ds)->ds_prev_snap_obj, FTAG, &prev);
@@ -1148,9 +2099,8 @@
return (err);
ds = prev;
}
- VERIFY(zap_add_int_key(dp->dp_meta_objset,
- scn->scn_phys.scn_queue_obj, ds->ds_object,
- dsl_dataset_phys(ds)->ds_prev_snap_txg, eca->tx) == 0);
+ scan_ds_queue_insert(scn, ds->ds_object,
+ dsl_dataset_phys(ds)->ds_prev_snap_txg);
dsl_dataset_rele(ds, FTAG);
return (0);
}
@@ -1160,6 +2110,7 @@
{
dsl_pool_t *dp = scn->scn_dp;
dsl_dataset_t *ds;
+ objset_t *os;
VERIFY3U(0, ==, dsl_dataset_hold_obj(dp, dsobj, FTAG, &ds));
@@ -1195,14 +2146,17 @@
dsl_dataset_name(ds, dsname);
zfs_dbgmsg("scanning dataset %llu (%s) is unnecessary because "
"cur_min_txg (%llu) >= max_txg (%llu)",
- dsobj, dsname,
- scn->scn_phys.scn_cur_min_txg,
- scn->scn_phys.scn_max_txg);
+ (longlong_t)dsobj, dsname,
+ (longlong_t)scn->scn_phys.scn_cur_min_txg,
+ (longlong_t)scn->scn_phys.scn_max_txg);
kmem_free(dsname, MAXNAMELEN);
goto out;
}
+ if (dmu_objset_from_ds(ds, &os))
+ goto out;
+
/*
* Only the ZIL in the head (non-snapshot) is valid. Even though
* snapshots can have ZIL block pointers (which may be the same
@@ -1212,14 +2166,8 @@
* rather than in scan_recurse(), because the regular snapshot
* block-sharing rules don't apply to it.
*/
- if (DSL_SCAN_IS_SCRUB_RESILVER(scn) && !dsl_dataset_is_snapshot(ds) &&
- ds->ds_dir != dp->dp_origin_snap->ds_dir) {
- objset_t *os;
- if (dmu_objset_from_ds(ds, &os) != 0) {
- goto out;
- }
+ if (!ds->ds_is_snapshot)
dsl_scan_zil(dp, &os->os_zil_header);
- }
/*
* Iterate over the bps in this ds.
@@ -1252,9 +2200,8 @@
if (scn->scn_phys.scn_flags & DSF_VISIT_DS_AGAIN) {
zfs_dbgmsg("incomplete pass; visiting again");
scn->scn_phys.scn_flags &= ~DSF_VISIT_DS_AGAIN;
- VERIFY(zap_add_int_key(dp->dp_meta_objset,
- scn->scn_phys.scn_queue_obj, ds->ds_object,
- scn->scn_phys.scn_cur_max_txg, tx) == 0);
+ scan_ds_queue_insert(scn, ds->ds_object,
+ scn->scn_phys.scn_cur_max_txg);
goto out;
}
@@ -1262,10 +2209,9 @@
* Add descendent datasets to work queue.
*/
if (dsl_dataset_phys(ds)->ds_next_snap_obj != 0) {
- VERIFY(zap_add_int_key(dp->dp_meta_objset,
- scn->scn_phys.scn_queue_obj,
+ scan_ds_queue_insert(scn,
dsl_dataset_phys(ds)->ds_next_snap_obj,
- dsl_dataset_phys(ds)->ds_creation_txg, tx) == 0);
+ dsl_dataset_phys(ds)->ds_creation_txg);
}
if (dsl_dataset_phys(ds)->ds_num_children > 1) {
boolean_t usenext = B_FALSE;
@@ -1286,17 +2232,21 @@
}
if (usenext) {
- VERIFY0(zap_join_key(dp->dp_meta_objset,
- dsl_dataset_phys(ds)->ds_next_clones_obj,
- scn->scn_phys.scn_queue_obj,
- dsl_dataset_phys(ds)->ds_creation_txg, tx));
+ zap_cursor_t zc;
+ zap_attribute_t za;
+ for (zap_cursor_init(&zc, dp->dp_meta_objset,
+ dsl_dataset_phys(ds)->ds_next_clones_obj);
+ zap_cursor_retrieve(&zc, &za) == 0;
+ (void) zap_cursor_advance(&zc)) {
+ scan_ds_queue_insert(scn,
+ zfs_strtonum(za.za_name, NULL),
+ dsl_dataset_phys(ds)->ds_creation_txg);
+ }
+ zap_cursor_fini(&zc);
} else {
- struct enqueue_clones_arg eca;
- eca.tx = tx;
- eca.originobj = ds->ds_object;
-
VERIFY0(dmu_objset_find_dp(dp, dp->dp_root_dir_obj,
- enqueue_clones_cb, &eca, DS_FIND_CHILDREN));
+ enqueue_clones_cb, &ds->ds_object,
+ DS_FIND_CHILDREN));
}
}
@@ -1308,7 +2258,6 @@
static int
enqueue_cb(dsl_pool_t *dp, dsl_dataset_t *hds, void *arg)
{
- dmu_tx_t *tx = arg;
dsl_dataset_t *ds;
int err;
dsl_scan_t *scn = dp->dp_scan;
@@ -1338,12 +2287,37 @@
ds = prev;
}
- VERIFY(zap_add_int_key(dp->dp_meta_objset, scn->scn_phys.scn_queue_obj,
- ds->ds_object, dsl_dataset_phys(ds)->ds_prev_snap_txg, tx) == 0);
+ scan_ds_queue_insert(scn, ds->ds_object,
+ dsl_dataset_phys(ds)->ds_prev_snap_txg);
dsl_dataset_rele(ds, FTAG);
return (0);
}
+/* ARGSUSED */
+void
+dsl_scan_ddt_entry(dsl_scan_t *scn, enum zio_checksum checksum,
+ ddt_entry_t *dde, dmu_tx_t *tx)
+{
+ const ddt_key_t *ddk = &dde->dde_key;
+ ddt_phys_t *ddp = dde->dde_phys;
+ blkptr_t bp;
+ zbookmark_phys_t zb = { 0 };
+ int p;
+
+ if (scn->scn_phys.scn_state != DSS_SCANNING)
+ return;
+
+ for (p = 0; p < DDT_PHYS_TYPES; p++, ddp++) {
+ if (ddp->ddp_phys_birth == 0 ||
+ ddp->ddp_phys_birth > scn->scn_phys.scn_max_txg)
+ continue;
+ ddt_bp_create(checksum, ddk, ddp, &bp);
+
+ scn->scn_visited_this_txg++;
+ scan_funcs[scn->scn_phys.scn_func](scn->scn_dp, &bp, &zb);
+ }
+}
+
/*
* Scrub/dedup interaction.
*
@@ -1416,36 +2390,20 @@
ddb->ddb_class > scn->scn_phys.scn_ddt_class_max);
}
-/* ARGSUSED */
-void
-dsl_scan_ddt_entry(dsl_scan_t *scn, enum zio_checksum checksum,
- ddt_entry_t *dde, dmu_tx_t *tx)
+static uint64_t
+dsl_scan_ds_maxtxg(dsl_dataset_t *ds)
{
- const ddt_key_t *ddk = &dde->dde_key;
- ddt_phys_t *ddp = dde->dde_phys;
- blkptr_t bp;
- zbookmark_phys_t zb = { 0 };
-
- if (scn->scn_phys.scn_state != DSS_SCANNING)
- return;
-
- for (int p = 0; p < DDT_PHYS_TYPES; p++, ddp++) {
- if (ddp->ddp_phys_birth == 0 ||
- ddp->ddp_phys_birth > scn->scn_phys.scn_max_txg)
- continue;
- ddt_bp_create(checksum, ddk, ddp, &bp);
-
- scn->scn_visited_this_txg++;
- scan_funcs[scn->scn_phys.scn_func](scn->scn_dp, &bp, &zb);
- }
+ uint64_t smt = ds->ds_dir->dd_pool->dp_scan->scn_phys.scn_max_txg;
+ if (ds->ds_is_snapshot)
+ return (MIN(smt, dsl_dataset_phys(ds)->ds_creation_txg));
+ return (smt);
}
static void
dsl_scan_visit(dsl_scan_t *scn, dmu_tx_t *tx)
{
+ scan_ds_t *sds;
dsl_pool_t *dp = scn->scn_dp;
- zap_cursor_t zc;
- zap_attribute_t za;
if (scn->scn_phys.scn_ddt_bookmark.ddb_class <=
scn->scn_phys.scn_ddt_class_max) {
@@ -1469,7 +2427,7 @@
if (spa_version(dp->dp_spa) < SPA_VERSION_DSL_SCRUB) {
VERIFY0(dmu_objset_find_dp(dp, dp->dp_root_dir_obj,
- enqueue_cb, tx, DS_FIND_CHILDREN));
+ enqueue_cb, NULL, DS_FIND_CHILDREN));
} else {
dsl_scan_visitds(scn,
dp->dp_origin_snap->ds_object, tx);
@@ -1477,40 +2435,42 @@
ASSERT(!scn->scn_suspending);
} else if (scn->scn_phys.scn_bookmark.zb_objset !=
ZB_DESTROYED_OBJSET) {
+ uint64_t dsobj = scn->scn_phys.scn_bookmark.zb_objset;
/*
- * If we were suspended, continue from here. Note if the
+ * If we were suspended, continue from here. Note if the
* ds we were suspended on was deleted, the zb_objset may
* be -1, so we will skip this and find a new objset
* below.
*/
- dsl_scan_visitds(scn, scn->scn_phys.scn_bookmark.zb_objset, tx);
+ dsl_scan_visitds(scn, dsobj, tx);
if (scn->scn_suspending)
return;
}
/*
- * In case we were suspended right at the end of the ds, zero the
+ * In case we suspended right at the end of the ds, zero the
* bookmark so we don't think that we're still trying to resume.
*/
bzero(&scn->scn_phys.scn_bookmark, sizeof (zbookmark_phys_t));
- /* keep pulling things out of the zap-object-as-queue */
- while (zap_cursor_init(&zc, dp->dp_meta_objset,
- scn->scn_phys.scn_queue_obj),
- zap_cursor_retrieve(&zc, &za) == 0) {
+ /*
+ * Keep pulling things out of the dataset avl queue. Updates to the
+ * persistent zap-object-as-queue happen only at checkpoints.
+ */
+ while ((sds = avl_first(&scn->scn_queue)) != NULL) {
dsl_dataset_t *ds;
- uint64_t dsobj;
+ uint64_t dsobj = sds->sds_dsobj;
+ uint64_t txg = sds->sds_txg;
- dsobj = zfs_strtonum(za.za_name, NULL);
- VERIFY3U(0, ==, zap_remove_int(dp->dp_meta_objset,
- scn->scn_phys.scn_queue_obj, dsobj, tx));
+ /* dequeue and free the ds from the queue */
+ scan_ds_queue_remove(scn, dsobj);
+ sds = NULL; /* must not be touched after removal */
- /* Set up min/max txg */
+ /* Set up min / max txg */
VERIFY3U(0, ==, dsl_dataset_hold_obj(dp, dsobj, FTAG, &ds));
- if (za.za_first_integer != 0) {
+ if (txg != 0) {
scn->scn_phys.scn_cur_min_txg =
- MAX(scn->scn_phys.scn_min_txg,
- za.za_first_integer);
+ MAX(scn->scn_phys.scn_min_txg, txg);
} else {
scn->scn_phys.scn_cur_min_txg =
MAX(scn->scn_phys.scn_min_txg,
@@ -1520,14 +2480,367 @@
dsl_dataset_rele(ds, FTAG);
dsl_scan_visitds(scn, dsobj, tx);
- zap_cursor_fini(&zc);
if (scn->scn_suspending)
return;
}
- zap_cursor_fini(&zc);
+ /* No more objsets to fetch, we're done */
+ scn->scn_phys.scn_bookmark.zb_objset = ZB_DESTROYED_OBJSET;
+ ASSERT0(scn->scn_suspending);
}
+/*
+ * Recursively count the leaf vdevs beneath vd that belong to the main
+ * pool and are readable. A log, spare or l2cache vdev, or any vdev that
+ * is not readable, contributes nothing (and its subtree is not visited).
+ */
+static uint64_t
+dsl_scan_count_leaves(vdev_t *vd)
+{
+	uint64_t total = 0;
+
+	/* auxiliary vdevs are not part of the main pool */
+	if (vd->vdev_islog || vd->vdev_isspare || vd->vdev_isl2cache)
+		return (0);
+
+	/* unreadable vdevs are not counted either */
+	if (!vdev_readable(vd))
+		return (0);
+
+	/* a leaf counts for exactly one */
+	if (vd->vdev_ops->vdev_op_leaf)
+		return (1);
+
+	/* interior vdev: sum up the children */
+	for (uint64_t c = 0; c < vd->vdev_children; c++)
+		total += dsl_scan_count_leaves(vd->vdev_child[c]);
+
+	return (total);
+}
+
+
+/*
+ * Account for one zio issued from queue q: add the allocated sizes of
+ * all DVAs of bp to the queue's per-txg zio byte total and bump its
+ * per-txg zio count.
+ */
+static void
+scan_io_queues_update_zio_stats(dsl_scan_io_queue_t *q, const blkptr_t *bp)
+{
+	uint64_t bp_asize = 0;
+	int d = 0;
+
+	while (d < BP_GET_NDVAS(bp)) {
+		bp_asize += DVA_GET_ASIZE(&bp->blk_dva[d]);
+		d++;
+	}
+
+	q->q_zios_this_txg++;
+	q->q_total_zio_size_this_txg += bp_asize;
+}
+
+/*
+ * Account for one segment processed from queue q: add its length
+ * (end - start) to the queue's per-txg segment byte total and bump its
+ * per-txg segment count.
+ */
+static void
+scan_io_queues_update_seg_stats(dsl_scan_io_queue_t *q, uint64_t start,
+    uint64_t end)
+{
+	uint64_t seg_len = end - start;
+
+	q->q_segs_this_txg++;
+	q->q_total_seg_size_this_txg += seg_len;
+}
+
+/*
+ * Decide whether issuing from the scan queues should stop for this txg.
+ * Returns B_TRUE if the pool is shutting down, or if the scan has run
+ * longer than the minimum time for its function (resilver or scrub) and
+ * any of the following holds: the dirty data percentage has reached
+ * zfs_vdev_async_write_active_min_dirty_percent, a txg sync is waiting,
+ * or the current sync has lasted at least zfs_txg_timeout seconds.
+ */
static boolean_t
+scan_io_queue_check_suspend(dsl_scan_t *scn)
+{
+	/* See comment in dsl_scan_check_suspend() */
+	dsl_pool_t *dp = scn->scn_dp;
+	spa_t *spa = dp->dp_spa;
+	uint64_t now_ns = gethrtime();
+	uint64_t scan_ns = now_ns - scn->scn_sync_start_time;
+	uint64_t sync_ns = now_ns - spa->spa_sync_starttime;
+	int dirty_pct = dp->dp_dirty_total * 100 / zfs_dirty_data_max;
+	int min_ms;
+
+	if (scn->scn_phys.scn_func == POOL_SCAN_RESILVER)
+		min_ms = zfs_resilver_min_time_ms;
+	else
+		min_ms = zfs_scrub_min_time_ms;
+
+	/* the pressure checks only apply once the minimum time has passed */
+	if (NSEC2MSEC(scan_ns) > min_ms) {
+		if (dirty_pct >= zfs_vdev_async_write_active_min_dirty_percent)
+			return (B_TRUE);
+		if (txg_sync_waiting(dp))
+			return (B_TRUE);
+		if (NSEC2SEC(sync_ns) >= zfs_txg_timeout)
+			return (B_TRUE);
+	}
+
+	/* a pool shutdown suspends us regardless of elapsed time */
+	return (spa_shutting_down(spa) ? B_TRUE : B_FALSE);
+}
+
+/*
+ * Given a list of scan_io_t's in io_list, this issues the io's out to
+ * disk. This consumes the io_list and frees the scan_io_t's. This is
+ * called when emptying queues, either when we're up against the memory
+ * limit or when we have finished scanning. Returns B_TRUE if we stopped
+ * processing the list before we finished. Any zios that were not issued
+ * will remain in the io_list.
+ */
+static boolean_t
+scan_io_queue_issue(dsl_scan_io_queue_t *queue, list_t *io_list)
+{
+ dsl_scan_t *scn = queue->q_scn;
+ scan_io_t *sio;
+ int64_t bytes_issued = 0;
+ boolean_t suspended = B_FALSE;
+
+ while ((sio = list_head(io_list)) != NULL) {
+ blkptr_t bp;
+
+ /* check before unlinking so an unissued sio stays on io_list */
+ if (scan_io_queue_check_suspend(scn)) {
+ suspended = B_TRUE;
+ break;
+ }
+
+ /* rebuild the block pointer from the sio, then issue it */
+ sio2bp(sio, &bp, queue->q_vd->vdev_id);
+ bytes_issued += sio->sio_asize;
+ scan_exec_io(scn->scn_dp, &bp, sio->sio_flags,
+ &sio->sio_zb, queue);
+ /* sio is the list head here; unlink it and free it last */
+ (void) list_remove_head(io_list);
+ scan_io_queues_update_zio_stats(queue, &bp);
+ kmem_free(sio, sizeof (*sio));
+ }
+
+ /* drop everything issued above from the pending total in one step */
+ atomic_add_64(&scn->scn_bytes_pending, -bytes_issued);
+
+ return (suspended);
+}
+
+/*
+ * Given a range_seg_t (extent) and a list, this function passes over a
+ * scan queue and gathers up the appropriate ios which fit into that
+ * scan seg (starting from lowest LBA). The gathered ios are removed from
+ * q_sios_by_addr and appended to list.
+ *
+ * Returns B_TRUE if ios belonging to the segment are still queued, in
+ * which case the segment is shrunk to start at the first remaining io.
+ * Returns B_FALSE if the segment was exhausted, in which case it is
+ * removed from the q_exts_by_addr range tree.
+ */
+static boolean_t
+scan_io_queue_gather(dsl_scan_io_queue_t *queue, range_seg_t *rs, list_t *list)
+{
+ scan_io_t srch_sio, *sio, *next_sio;
+ avl_index_t idx;
+ uint_t num_sios = 0;
+ int64_t bytes_issued = 0;
+
+ ASSERT(rs != NULL);
+ ASSERT(MUTEX_HELD(&queue->q_vd->vdev_scan_io_queue_lock));
+
+ /* only the offset of the search key is filled in */
+ srch_sio.sio_offset = rs->rs_start;
+
+ /*
+ * The exact start of the extent might not contain any matching zios,
+ * so if that's the case, examine the next one in the tree.
+ */
+ sio = avl_find(&queue->q_sios_by_addr, &srch_sio, &idx);
+ if (sio == NULL)
+ sio = avl_nearest(&queue->q_sios_by_addr, idx, AVL_AFTER);
+
+ /*
+ * NOTE(review): the loop admits another sio while num_sios <= 32, so
+ * up to 33 sios can be gathered per call, while the comment below
+ * speaks of 32 -- confirm which is intended.
+ */
+ while (sio != NULL && sio->sio_offset < rs->rs_end && num_sios <= 32) {
+ ASSERT3U(sio->sio_offset, >=, rs->rs_start);
+ ASSERT3U(sio->sio_offset + sio->sio_asize, <=, rs->rs_end);
+
+ /* fetch the successor before sio leaves the tree */
+ next_sio = AVL_NEXT(&queue->q_sios_by_addr, sio);
+ avl_remove(&queue->q_sios_by_addr, sio);
+
+ bytes_issued += sio->sio_asize;
+ num_sios++;
+ list_insert_tail(list, sio);
+ sio = next_sio;
+ }
+
+ /*
+ * We limit the number of sios we process at once to 32 to avoid
+ * biting off more than we can chew. If we didn't take everything
+ * in the segment we update it to reflect the work we were able to
+ * complete. Otherwise, we remove it from the range tree entirely.
+ */
+ if (sio != NULL && sio->sio_offset < rs->rs_end) {
+ range_tree_adjust_fill(queue->q_exts_by_addr, rs,
+ -bytes_issued);
+ range_tree_resize_segment(queue->q_exts_by_addr, rs,
+ sio->sio_offset, rs->rs_end - sio->sio_offset);
+
+ return (B_TRUE);
+ } else {
+ range_tree_remove(queue->q_exts_by_addr, rs->rs_start,
+ rs->rs_end - rs->rs_start);
+ return (B_FALSE);
+ }
+}
+
+
+/*
+ * Pick the next extent of this queue from which io's are to be issued.
+ * Called from the queue emptying thread with the queue lock held.
+ *
+ * Nothing is selected (NULL is returned) unless the scan is currently
+ * checkpointing or clearing. When it is, the zfs_scan_issue_strategy
+ * tunable takes precedence: 1 forces LBA order (first extent of the
+ * by-address range tree), 2 forces by-size order (first extent of the
+ * by-size tree). For any other value of the tunable:
+ *   - While checkpointing, no new extents will be added to the sorting
+ *     queue, so the current sorting is as good as it will ever get and
+ *     extents are issued in LBA order.
+ *   - While only clearing, the largest segments are issued first to keep
+ *     IO as sequential as possible, leaving the smaller extents for
+ *     later in the hope that they grow into larger sequential segments.
+ */
+static const range_seg_t *
+scan_io_queue_fetch_ext(dsl_scan_io_queue_t *queue)
+{
+	dsl_scan_t *scn = queue->q_scn;
+
+	ASSERT(MUTEX_HELD(&queue->q_vd->vdev_scan_io_queue_lock));
+	ASSERT(scn->scn_is_sorted);
+
+	/* neither checkpointing nor clearing: nothing to issue */
+	if (!scn->scn_checkpointing && !scn->scn_clearing)
+		return (NULL);
+
+	/* handle tunable overrides */
+	switch (zfs_scan_issue_strategy) {
+	case 1:
+		return (range_tree_first(queue->q_exts_by_addr));
+	case 2:
+		return (avl_first(&queue->q_exts_by_size));
+	default:
+		break;
+	}
+
+	/* default policy: LBA order for checkpoints, by size otherwise */
+	if (scn->scn_checkpointing)
+		return (range_tree_first(queue->q_exts_by_addr));
+
+	return (avl_first(&queue->q_exts_by_size));
+}
+
+/*
+ * Taskq callback that empties a single scan queue, passed in as arg.
+ * For as long as scan_io_queue_fetch_ext() returns an extent, the sios
+ * of that extent are gathered in batches and issued. Processing stops
+ * early if scan_io_queue_issue() reports that it was suspended; any
+ * sios that were gathered but not issued are then put back on the queue.
+ */
+static void
+scan_io_queues_run_one(void *arg)
+{
+ dsl_scan_io_queue_t *queue = arg;
+ kmutex_t *q_lock = &queue->q_vd->vdev_scan_io_queue_lock;
+ boolean_t suspended = B_FALSE;
+ range_seg_t *rs = NULL;
+ scan_io_t *sio = NULL;
+ list_t sio_list;
+ uint64_t bytes_per_leaf = zfs_scan_vdev_limit;
+ uint64_t nr_leaves = dsl_scan_count_leaves(queue->q_vd);
+
+ ASSERT(queue->q_scn->scn_is_sorted);
+
+ list_create(&sio_list, sizeof (scan_io_t),
+ offsetof(scan_io_t, sio_nodes.sio_list_node));
+ mutex_enter(q_lock);
+
+ /* calculate maximum in-flight bytes for this txg (min 1MB) */
+ queue->q_maxinflight_bytes =
+ MAX(nr_leaves * bytes_per_leaf, 1ULL << 20);
+
+ /* reset per-queue scan statistics for this txg */
+ queue->q_total_seg_size_this_txg = 0;
+ queue->q_segs_this_txg = 0;
+ queue->q_total_zio_size_this_txg = 0;
+ queue->q_zios_this_txg = 0;
+
+ /* loop until we have run out of time or sios */
+ while ((rs = (range_seg_t*)scan_io_queue_fetch_ext(queue)) != NULL) {
+ uint64_t seg_start = 0, seg_end = 0;
+ boolean_t more_left = B_TRUE;
+
+ ASSERT(list_is_empty(&sio_list));
+
+ /* loop while we still have sios left to process in this rs */
+ while (more_left) {
+ scan_io_t *first_sio, *last_sio;
+
+ /*
+ * We have selected which extent needs to be
+ * processed next. Gather up the corresponding sios.
+ */
+ more_left = scan_io_queue_gather(queue, rs, &sio_list);
+ ASSERT(!list_is_empty(&sio_list));
+ first_sio = list_head(&sio_list);
+ last_sio = list_tail(&sio_list);
+
+ /*
+ * Track the span covered by the gathered batches.
+ * NOTE(review): seg_start == 0 doubles as "not set
+ * yet", so a first sio at offset 0 would be replaced
+ * by the start of a later batch; this only feeds the
+ * debugging statistics below.
+ */
+ seg_end = last_sio->sio_offset + last_sio->sio_asize;
+ if (seg_start == 0)
+ seg_start = first_sio->sio_offset;
+
+ /*
+ * Issuing sios can take a long time so drop the
+ * queue lock. The sio queue won't be updated by
+ * other threads since we're in syncing context so
+ * we can be sure that our trees will remain exactly
+ * as we left them.
+ */
+ mutex_exit(q_lock);
+ suspended = scan_io_queue_issue(queue, &sio_list);
+ mutex_enter(q_lock);
+
+ if (suspended)
+ break;
+ }
+ /* update statistics for debugging purposes */
+ scan_io_queues_update_seg_stats(queue, seg_start, seg_end);
+
+ if (suspended)
+ break;
+ }
+
+
+ /*
+ * If we were suspended in the middle of processing,
+ * requeue any unfinished sios and exit.
+ */
+ while ((sio = list_head(&sio_list)) != NULL) {
+ list_remove(&sio_list, sio);
+ scan_io_queue_insert_impl(queue, sio);
+ }
+
+ mutex_exit(q_lock);
+ list_destroy(&sio_list);
+}
+
+/*
+ * Performs an emptying run on all scan queues in the pool. This just
+ * punches out one thread per top-level vdev, each of which processes
+ * only that vdev's scan queue. We can parallelize the I/O here because
+ * we know that each queue's io's only affect its own top-level vdev.
+ *
+ * This function waits for the queue runs to complete, and must be
+ * called from dsl_scan_sync (or in general, syncing context).
+ */
+static void
+scan_io_queues_run(dsl_scan_t *scn)
+{
+ spa_t *spa = scn->scn_dp->dp_spa;
+
+ ASSERT(scn->scn_is_sorted);
+ ASSERT(spa_config_held(spa, SCL_CONFIG, RW_READER));
+
+ /* nothing is queued, so there is nothing to issue */
+ if (scn->scn_bytes_pending == 0)
+ return;
+
+ /* the taskq is created on first use and then kept in scn_taskq */
+ if (scn->scn_taskq == NULL) {
+ char *tq_name = kmem_zalloc(ZFS_MAX_DATASET_NAME_LEN + 16,
+ KM_SLEEP);
+ int nthreads = spa->spa_root_vdev->vdev_children;
+
+ /*
+ * We need to make this taskq *always* execute as many
+ * threads in parallel as we have top-level vdevs and no
+ * less, otherwise strange serialization of the calls to
+ * scan_io_queues_run_one can occur during spa_sync runs
+ * and that significantly impacts performance.
+ */
+ (void) snprintf(tq_name, ZFS_MAX_DATASET_NAME_LEN + 16,
+ "dsl_scan_tq_%s", spa->spa_name);
+ scn->scn_taskq = taskq_create(tq_name, nthreads, minclsyspri,
+ nthreads, nthreads, TASKQ_PREPOPULATE);
+ kmem_free(tq_name, ZFS_MAX_DATASET_NAME_LEN + 16);
+ }
+
+ /* dispatch one run per top-level vdev that has a scan queue */
+ for (uint64_t i = 0; i < spa->spa_root_vdev->vdev_children; i++) {
+ vdev_t *vd = spa->spa_root_vdev->vdev_child[i];
+
+ mutex_enter(&vd->vdev_scan_io_queue_lock);
+ if (vd->vdev_scan_io_queue != NULL) {
+ VERIFY(taskq_dispatch(scn->scn_taskq,
+ scan_io_queues_run_one, vd->vdev_scan_io_queue,
+ TQ_SLEEP) != TASKQID_INVALID);
+ }
+ mutex_exit(&vd->vdev_scan_io_queue_lock);
+ }
+
+ /*
+ * Wait for the queues to finish issuing their IOs for this run
+ * before we return. There may still be IOs in flight at this
+ * point.
+ */
+ taskq_wait(scn->scn_taskq);
+}
+
+static boolean_t
dsl_scan_async_block_should_pause(dsl_scan_t *scn)
{
uint64_t elapsed_nanosecs;
@@ -1581,6 +2894,41 @@
return (0);
}
+/*
+ * Fold the per-queue statistics of every top-level vdev that has a scan
+ * queue into the scan-wide per-txg statistics: the segment and zio
+ * counts and the average segment and zio sizes. If either the segment
+ * count or the zio count is zero, all four values are reported as zero.
+ */
+static void
+dsl_scan_update_stats(dsl_scan_t *scn)
+{
+	vdev_t *rvd = scn->scn_dp->dp_spa->spa_root_vdev;
+	uint64_t seg_bytes = 0, seg_cnt = 0;
+	uint64_t zio_bytes = 0, zio_cnt = 0;
+	boolean_t have_both;
+
+	for (uint64_t c = 0; c < rvd->vdev_children; c++) {
+		dsl_scan_io_queue_t *q;
+
+		q = rvd->vdev_child[c]->vdev_scan_io_queue;
+		if (q != NULL) {
+			seg_bytes += q->q_total_seg_size_this_txg;
+			zio_bytes += q->q_total_zio_size_this_txg;
+			seg_cnt += q->q_segs_this_txg;
+			zio_cnt += q->q_zios_this_txg;
+		}
+	}
+
+	/* the averages are only meaningful if both counts are non-zero */
+	have_both = (seg_cnt != 0 && zio_cnt != 0);
+
+	scn->scn_avg_seg_size_this_txg = have_both ? seg_bytes / seg_cnt : 0;
+	scn->scn_avg_zio_size_this_txg = have_both ? zio_bytes / zio_cnt : 0;
+	scn->scn_segs_this_txg = have_both ? seg_cnt : 0;
+	scn->scn_zios_this_txg = have_both ? zio_cnt : 0;
+}
+
boolean_t
dsl_scan_active(dsl_scan_t *scn)
{
@@ -1591,8 +2939,7 @@
return (B_FALSE);
if (spa_shutting_down(spa))
return (B_FALSE);
- if ((scn->scn_phys.scn_state == DSS_SCANNING &&
- !dsl_scan_is_paused_scrub(scn)) ||
+ if ((dsl_scan_is_running(scn) && !dsl_scan_is_paused_scrub(scn)) ||
(scn->scn_async_destroying && !scn->scn_async_stalled))
return (B_TRUE);
@@ -1614,14 +2961,15 @@
return (0);
if (zfs_free_bpobj_enabled &&
- spa_version(dp->dp_spa) >= SPA_VERSION_DEADLISTS) {
+ spa_version(spa) >= SPA_VERSION_DEADLISTS) {
scn->scn_is_bptree = B_FALSE;
scn->scn_async_block_min_time_ms = zfs_free_min_time_ms;
- scn->scn_zio_root = zio_root(dp->dp_spa, NULL,
+ scn->scn_zio_root = zio_root(spa, NULL,
NULL, ZIO_FLAG_MUSTSUCCEED);
err = bpobj_iterate(&dp->dp_free_bpobj,
dsl_scan_free_block_cb, scn, tx);
- VERIFY3U(0, ==, zio_wait(scn->scn_zio_root));
+ VERIFY0(zio_wait(scn->scn_zio_root));
+ scn->scn_zio_root = NULL;
if (err != 0 && err != ERESTART)
zfs_panic_recover("error %u from bpobj_iterate()", err);
@@ -1630,11 +2978,12 @@
if (err == 0 && spa_feature_is_active(spa, SPA_FEATURE_ASYNC_DESTROY)) {
ASSERT(scn->scn_async_destroying);
scn->scn_is_bptree = B_TRUE;
- scn->scn_zio_root = zio_root(dp->dp_spa, NULL,
+ scn->scn_zio_root = zio_root(spa, NULL,
NULL, ZIO_FLAG_MUSTSUCCEED);
err = bptree_iterate(dp->dp_meta_objset,
dp->dp_bptree_obj, B_TRUE, dsl_scan_free_block_cb, scn, tx);
VERIFY0(zio_wait(scn->scn_zio_root));
+ scn->scn_zio_root = NULL;
if (err == EIO || err == ECKSUM) {
err = 0;
@@ -1743,12 +3092,60 @@
return (0);
}
+/*
+ * Decide whether the given DVA needs to be read as part of a resilver.
+ * Gang DVAs always do. Any other DVA does only if phys_birth is covered
+ * by the DTL_PARTIAL of its top-level vdev and that vdev reports, via
+ * vdev_dtl_need_resilver(), that the offset must be resilvered.
+ */
+static boolean_t
+dsl_scan_need_resilver(spa_t *spa, const dva_t *dva, size_t psize,
+    uint64_t phys_birth)
+{
+	vdev_t *tvd;
+
+	/*
+	 * Gang members may be spread across multiple vdevs, so the best
+	 * estimate we have is the scrub range, which has already been
+	 * checked.
+	 * XXX -- it would be better to change our allocation policy to
+	 * ensure that all gang members reside on the same vdev.
+	 */
+	if (DVA_GET_GANG(dva))
+		return (B_TRUE);
+
+	tvd = vdev_lookup_top(spa, DVA_GET_VDEV(dva));
+
+	/*
+	 * Two conditions must both hold, checked in this order:
+	 *
+	 * 1) The txg falls within the range which must be resilvered.
+	 *    DVAs outside this range can always be skipped.
+	 *
+	 * 2) The top-level vdev must resilver this offset. When the
+	 *    offset does not intersect with a dirty leaf DTL then it may
+	 *    be possible to skip the resilver IO. The psize is provided
+	 *    instead of asize to simplify the check for RAIDZ.
+	 */
+	if (vdev_dtl_contains(tvd, DTL_PARTIAL, phys_birth, 1) &&
+	    vdev_dtl_need_resilver(tvd, DVA_GET_OFFSET(dva), psize))
+		return (B_TRUE);
+
+	return (B_FALSE);
+}
+
+/*
+ * This is the primary entry point for scans that is called from syncing
+ * context. Scans must happen entirely during syncing context so that we
+ * can guarantee that blocks we are currently scanning will not change out
+ * from under us. While a scan is active, this function controls how quickly
+ * transaction groups proceed, instead of the normal handling provided by
+ * txg_sync_thread().
+ */
void
dsl_scan_sync(dsl_pool_t *dp, dmu_tx_t *tx)
{
+ int err = 0;
dsl_scan_t *scn = dp->dp_scan;
spa_t *spa = dp->dp_spa;
- int err = 0;
+ state_sync_type_t sync_type = SYNC_OPTIONAL;
/*
* Check for scn_restart_txg before checking spa_load_state, so
@@ -1761,7 +3158,7 @@
if (vdev_resilver_needed(spa->spa_root_vdev, NULL, NULL))
func = POOL_SCAN_RESILVER;
zfs_dbgmsg("restarting scan func=%u txg=%llu",
- func, tx->tx_txg);
+ func, (longlong_t)tx->tx_txg);
dsl_scan_setup_sync(&func, tx);
}
@@ -1785,7 +3182,17 @@
if (!scn->scn_async_stalled && !dsl_scan_active(scn))
return;
+ /* reset scan statistics */
scn->scn_visited_this_txg = 0;
+ scn->scn_holes_this_txg = 0;
+ scn->scn_lt_min_this_txg = 0;
+ scn->scn_gt_max_this_txg = 0;
+ scn->scn_ddt_contained_this_txg = 0;
+ scn->scn_objsets_visited_this_txg = 0;
+ scn->scn_avg_seg_size_this_txg = 0;
+ scn->scn_segs_this_txg = 0;
+ scn->scn_avg_zio_size_this_txg = 0;
+ scn->scn_zios_this_txg = 0;
scn->scn_suspending = B_FALSE;
scn->scn_sync_start_time = gethrtime();
spa->spa_scrub_active = B_TRUE;
@@ -1801,110 +3208,189 @@
if (err != 0)
return;
- if (scn->scn_phys.scn_state != DSS_SCANNING)
+ if (!dsl_scan_is_running(scn) || dsl_scan_is_paused_scrub(scn))
return;
- if (scn->scn_done_txg == tx->tx_txg) {
- ASSERT(!scn->scn_suspending);
- /* finished with scan. */
- zfs_dbgmsg("txg %llu scan complete", tx->tx_txg);
- dsl_scan_done(scn, B_TRUE, tx);
- ASSERT3U(spa->spa_scrub_inflight, ==, 0);
- dsl_scan_sync_state(scn, tx);
+ /*
+ * Wait a few txgs after importing to begin scanning so that
+ * we can get the pool imported quickly.
+ */
+ if (spa->spa_syncing_txg < spa->spa_first_txg + SCAN_IMPORT_WAIT_TXGS)
return;
+
+ /*
+ * It is possible to switch from unsorted to sorted at any time,
+ * but afterwards the scan will remain sorted unless reloaded from
+ * a checkpoint after a reboot.
+ */
+ if (!zfs_scan_legacy) {
+ scn->scn_is_sorted = B_TRUE;
+ if (scn->scn_last_checkpoint == 0)
+ scn->scn_last_checkpoint = ddi_get_lbolt();
}
- if (dsl_scan_is_paused_scrub(scn))
- return;
+ /*
+ * For sorted scans, determine what kind of work we will be doing
+ * this txg based on our memory limitations and whether or not we
+ * need to perform a checkpoint.
+ */
+ if (scn->scn_is_sorted) {
+ /*
+ * If we are over our checkpoint interval, set scn_clearing
+ * so that we can begin checkpointing immediately. The
+ * checkpoint allows us to save a consistent bookmark
+ * representing how much data we have scrubbed so far.
+ * Otherwise, use the memory limit to determine if we should
+ * scan for metadata or start issuing scrub IOs. We accumulate
+ * metadata until we hit our hard memory limit at which point
+ * we issue scrub IOs until we are at our soft memory limit.
+ */
+ if (scn->scn_checkpointing ||
+ ddi_get_lbolt() - scn->scn_last_checkpoint >
+ SEC_TO_TICK(zfs_scan_checkpoint_intval)) {
+ if (!scn->scn_checkpointing)
+ zfs_dbgmsg("begin scan checkpoint");
- if (scn->scn_phys.scn_ddt_bookmark.ddb_class <=
- scn->scn_phys.scn_ddt_class_max) {
- zfs_dbgmsg("doing scan sync txg %llu; "
- "ddt bm=%llu/%llu/%llu/%llx",
- (longlong_t)tx->tx_txg,
- (longlong_t)scn->scn_phys.scn_ddt_bookmark.ddb_class,
- (longlong_t)scn->scn_phys.scn_ddt_bookmark.ddb_type,
- (longlong_t)scn->scn_phys.scn_ddt_bookmark.ddb_checksum,
- (longlong_t)scn->scn_phys.scn_ddt_bookmark.ddb_cursor);
- ASSERT(scn->scn_phys.scn_bookmark.zb_objset == 0);
- ASSERT(scn->scn_phys.scn_bookmark.zb_object == 0);
- ASSERT(scn->scn_phys.scn_bookmark.zb_level == 0);
- ASSERT(scn->scn_phys.scn_bookmark.zb_blkid == 0);
+ scn->scn_checkpointing = B_TRUE;
+ scn->scn_clearing = B_TRUE;
+ } else {
+ boolean_t should_clear = dsl_scan_should_clear(scn);
+ if (should_clear && !scn->scn_clearing) {
+ zfs_dbgmsg("begin scan clearing");
+ scn->scn_clearing = B_TRUE;
+ } else if (!should_clear && scn->scn_clearing) {
+ zfs_dbgmsg("finish scan clearing");
+ scn->scn_clearing = B_FALSE;
+ }
+ }
} else {
- zfs_dbgmsg("doing scan sync txg %llu; bm=%llu/%llu/%llu/%llu",
- (longlong_t)tx->tx_txg,
- (longlong_t)scn->scn_phys.scn_bookmark.zb_objset,
- (longlong_t)scn->scn_phys.scn_bookmark.zb_object,
- (longlong_t)scn->scn_phys.scn_bookmark.zb_level,
- (longlong_t)scn->scn_phys.scn_bookmark.zb_blkid);
+ ASSERT0(scn->scn_checkpointing);
+ ASSERT0(scn->scn_clearing);
}
- scn->scn_zio_root = zio_root(dp->dp_spa, NULL,
- NULL, ZIO_FLAG_CANFAIL);
- dsl_pool_config_enter(dp, FTAG);
- dsl_scan_visit(scn, tx);
- dsl_pool_config_exit(dp, FTAG);
- (void) zio_wait(scn->scn_zio_root);
- scn->scn_zio_root = NULL;
+ if (!scn->scn_clearing && scn->scn_done_txg == 0) {
+ /* Need to scan metadata for more blocks to scrub */
+ dsl_scan_phys_t *scnp = &scn->scn_phys;
+ taskqid_t prefetch_tqid;
+ uint64_t bytes_per_leaf = zfs_scan_vdev_limit;
+ uint64_t nr_leaves = dsl_scan_count_leaves(spa->spa_root_vdev);
- zfs_dbgmsg("visited %llu blocks in %llums",
- (longlong_t)scn->scn_visited_this_txg,
- (longlong_t)NSEC2MSEC(gethrtime() - scn->scn_sync_start_time));
+ /*
+ * Calculate the max number of in-flight bytes for pool-wide
+ * scanning operations (minimum 1MB). Limits for the issuing
+ * phase are done per top-level vdev and are handled separately.
+ */
+ scn->scn_maxinflight_bytes =
+ MAX(nr_leaves * bytes_per_leaf, 1ULL << 20);
- if (!scn->scn_suspending) {
- scn->scn_done_txg = tx->tx_txg + 1;
- zfs_dbgmsg("txg %llu traversal complete, waiting till txg %llu",
- tx->tx_txg, scn->scn_done_txg);
- }
-
- if (DSL_SCAN_IS_SCRUB_RESILVER(scn)) {
- mutex_enter(&spa->spa_scrub_lock);
- while (spa->spa_scrub_inflight > 0) {
- cv_wait(&spa->spa_scrub_io_cv,
- &spa->spa_scrub_lock);
+ if (scnp->scn_ddt_bookmark.ddb_class <=
+ scnp->scn_ddt_class_max) {
+ ASSERT(ZB_IS_ZERO(&scnp->scn_bookmark));
+ zfs_dbgmsg("doing scan sync txg %llu; "
+ "ddt bm=%llu/%llu/%llu/%llx",
+ (longlong_t)tx->tx_txg,
+ (longlong_t)scnp->scn_ddt_bookmark.ddb_class,
+ (longlong_t)scnp->scn_ddt_bookmark.ddb_type,
+ (longlong_t)scnp->scn_ddt_bookmark.ddb_checksum,
+ (longlong_t)scnp->scn_ddt_bookmark.ddb_cursor);
+ } else {
+ zfs_dbgmsg("doing scan sync txg %llu; "
+ "bm=%llu/%llu/%llu/%llu",
+ (longlong_t)tx->tx_txg,
+ (longlong_t)scnp->scn_bookmark.zb_objset,
+ (longlong_t)scnp->scn_bookmark.zb_object,
+ (longlong_t)scnp->scn_bookmark.zb_level,
+ (longlong_t)scnp->scn_bookmark.zb_blkid);
}
- mutex_exit(&spa->spa_scrub_lock);
- }
- dsl_scan_sync_state(scn, tx);
-}
+ scn->scn_zio_root = zio_root(dp->dp_spa, NULL,
+ NULL, ZIO_FLAG_CANFAIL);
-/*
- * This will start a new scan, or restart an existing one.
- */
-void
-dsl_resilver_restart(dsl_pool_t *dp, uint64_t txg)
-{
- if (txg == 0) {
- dmu_tx_t *tx;
- tx = dmu_tx_create_dd(dp->dp_mos_dir);
- VERIFY(0 == dmu_tx_assign(tx, TXG_WAIT));
+ scn->scn_prefetch_stop = B_FALSE;
+ prefetch_tqid = taskq_dispatch(dp->dp_sync_taskq,
+ dsl_scan_prefetch_thread, scn, TQ_SLEEP);
+ ASSERT(prefetch_tqid != TASKQID_INVALID);
- txg = dmu_tx_get_txg(tx);
- dp->dp_scan->scn_restart_txg = txg;
- dmu_tx_commit(tx);
- } else {
- dp->dp_scan->scn_restart_txg = txg;
+ dsl_pool_config_enter(dp, FTAG);
+ dsl_scan_visit(scn, tx);
+ dsl_pool_config_exit(dp, FTAG);
+
+ mutex_enter(&dp->dp_spa->spa_scrub_lock);
+ scn->scn_prefetch_stop = B_TRUE;
+ cv_broadcast(&spa->spa_scrub_io_cv);
+ mutex_exit(&dp->dp_spa->spa_scrub_lock);
+
+ taskq_wait_id(dp->dp_sync_taskq, prefetch_tqid);
+ (void) zio_wait(scn->scn_zio_root);
+ scn->scn_zio_root = NULL;
+
+ zfs_dbgmsg("scan visited %llu blocks in %llums "
+ "(%llu os's, %llu holes, %llu < mintxg, "
+ "%llu in ddt, %llu > maxtxg)",
+ (longlong_t)scn->scn_visited_this_txg,
+ (longlong_t)NSEC2MSEC(gethrtime() -
+ scn->scn_sync_start_time),
+ (longlong_t)scn->scn_objsets_visited_this_txg,
+ (longlong_t)scn->scn_holes_this_txg,
+ (longlong_t)scn->scn_lt_min_this_txg,
+ (longlong_t)scn->scn_ddt_contained_this_txg,
+ (longlong_t)scn->scn_gt_max_this_txg);
+
+ if (!scn->scn_suspending) {
+ ASSERT0(avl_numnodes(&scn->scn_queue));
+ scn->scn_done_txg = tx->tx_txg + 1;
+ if (scn->scn_is_sorted) {
+ scn->scn_checkpointing = B_TRUE;
+ scn->scn_clearing = B_TRUE;
+ }
+ zfs_dbgmsg("scan complete txg %llu",
+ (longlong_t)tx->tx_txg);
+ }
+ } else if (scn->scn_is_sorted && scn->scn_bytes_pending != 0) {
+ /* need to issue scrubbing IOs from per-vdev queues */
+ scn->scn_zio_root = zio_root(dp->dp_spa, NULL,
+ NULL, ZIO_FLAG_CANFAIL);
+ scan_io_queues_run(scn);
+ (void) zio_wait(scn->scn_zio_root);
+ scn->scn_zio_root = NULL;
+
+ /* calculate and dprintf the current memory usage */
+ (void) dsl_scan_should_clear(scn);
+ dsl_scan_update_stats(scn);
+
+ zfs_dbgmsg("scrubbed %llu blocks (%llu segs) in %llums "
+ "(avg_block_size = %llu, avg_seg_size = %llu)",
+ (longlong_t)scn->scn_zios_this_txg,
+ (longlong_t)scn->scn_segs_this_txg,
+ (longlong_t)NSEC2MSEC(gethrtime() -
+ scn->scn_sync_start_time),
+ (longlong_t)scn->scn_avg_zio_size_this_txg,
+ (longlong_t)scn->scn_avg_seg_size_this_txg);
+ } else if (scn->scn_done_txg != 0 && scn->scn_done_txg <= tx->tx_txg) {
+ /* Finished with everything. Mark the scrub as complete */
+ zfs_dbgmsg("scan issuing complete txg %llu",
+ (longlong_t)tx->tx_txg);
+ ASSERT3U(scn->scn_done_txg, !=, 0);
+ ASSERT0(spa->spa_scrub_inflight);
+ ASSERT0(scn->scn_bytes_pending);
+ dsl_scan_done(scn, B_TRUE, tx);
+ sync_type = SYNC_MANDATORY;
}
- zfs_dbgmsg("restarting resilver txg=%llu", txg);
-}
-boolean_t
-dsl_scan_resilvering(dsl_pool_t *dp)
-{
- return (dp->dp_scan->scn_phys.scn_state == DSS_SCANNING &&
- dp->dp_scan->scn_phys.scn_func == POOL_SCAN_RESILVER);
+ dsl_scan_sync_state(scn, tx, sync_type);
}
-/*
- * scrub consumers
- */
-
static void
-count_block(zfs_all_blkstats_t *zab, const blkptr_t *bp)
+count_block(dsl_scan_t *scn, zfs_all_blkstats_t *zab, const blkptr_t *bp)
{
int i;
+ /* update the spa's stats on how many bytes we have issued */
+ for (i = 0; i < BP_GET_NDVAS(bp); i++) {
+ atomic_add_64(&scn->scn_dp->dp_spa->spa_scan_pass_issued,
+ DVA_GET_ASIZE(&bp->blk_dva[i]));
+ }
+
/*
* If we resume after a reboot, zab will be NULL; don't record
* incomplete stats in that case.
@@ -1912,6 +3398,8 @@
if (zab == NULL)
return;
+ mutex_enter(&zab->zab_lock);
+
for (i = 0; i < 4; i++) {
int l = (i < 2) ? BP_GET_LEVEL(bp) : DN_MAX_LEVELS;
int t = (i & 1) ? BP_GET_TYPE(bp) : DMU_OT_TOTAL;
@@ -1946,24 +3434,96 @@
break;
}
}
+
+ mutex_exit(&zab->zab_lock);
}
+/*
+ * Insert sio into the queue's by-address sio tree and add its range
+ * (sio_offset, sio_asize) to the q_exts_by_addr range tree. If an equal
+ * sio is already present in the tree, sio is freed instead and its asize
+ * is subtracted from scn_bytes_pending. The caller must hold the queue
+ * lock.
+ */
static void
-dsl_scan_scrub_done(zio_t *zio)
+scan_io_queue_insert_impl(dsl_scan_io_queue_t *queue, scan_io_t *sio)
{
- spa_t *spa = zio->io_spa;
+ avl_index_t idx;
+ int64_t asize = sio->sio_asize;
+ dsl_scan_t *scn = queue->q_scn;
- abd_free(zio->io_abd);
+ ASSERT(MUTEX_HELD(&queue->q_vd->vdev_scan_io_queue_lock));
- mutex_enter(&spa->spa_scrub_lock);
- spa->spa_scrub_inflight--;
- cv_broadcast(&spa->spa_scrub_io_cv);
+ /* asize was read up front because sio may be freed below */
+ if (avl_find(&queue->q_sios_by_addr, sio, &idx) != NULL) {
+ /* block is already scheduled for reading */
+ atomic_add_64(&scn->scn_bytes_pending, -asize);
+ kmem_free(sio, sizeof (*sio));
+ return;
+ }
+ avl_insert(&queue->q_sios_by_addr, sio, idx);
+ range_tree_add(queue->q_exts_by_addr, sio->sio_offset, asize);
+}
- if (zio->io_error && (zio->io_error != ECKSUM ||
- !(zio->io_flags & ZIO_FLAG_SPECULATIVE))) {
- spa->spa_dsl_pool->dp_scan->scn_phys.scn_errors++;
+/*
+ * Build a scan_io_t for DVA number dva_i of bp, as discovered by the
+ * metadata scanning process, and place it into the scan sorting queue.
+ * The I/O must already be suitable for us to process, which is
+ * controlled by dsl_scan_enqueue(). Gang blocks are not accepted here
+ * and the caller must hold the queue lock.
+ */
+static void
+scan_io_queue_insert(dsl_scan_io_queue_t *queue, const blkptr_t *bp, int dva_i,
+    int zio_flags, const zbookmark_phys_t *zb)
+{
+	scan_io_t *entry = kmem_zalloc(sizeof (scan_io_t), KM_SLEEP);
+
+	ASSERT0(BP_IS_GANG(bp));
+	ASSERT(MUTEX_HELD(&queue->q_vd->vdev_scan_io_queue_lock));
+
+	/* capture everything needed to re-create and issue the I/O later */
+	bp2sio(bp, entry, dva_i);
+	entry->sio_zb = *zb;
+	entry->sio_flags = zio_flags;
+
+	/*
+	 * The bytes pending counter is bumped before the entry becomes
+	 * visible in the queue so that we can't get an integer underflow
+	 * in case the worker processes the zio before we get to
+	 * incrementing this counter.
+	 */
+	atomic_add_64(&queue->q_scn->scn_bytes_pending, entry->sio_asize);
+
+	scan_io_queue_insert_impl(queue, entry);
+}
+
+/*
+ * Given a set of I/O parameters as discovered by the metadata traversal
+ * process, attempts to place the I/O into the sorted queues (if allowed),
+ * or immediately executes the I/O.
+ *
+ * When queuing, one entry is inserted per DVA of bp, each into the scan
+ * queue of that DVA's top-level vdev; the queue is created on first use.
+ */
+static void
+dsl_scan_enqueue(dsl_pool_t *dp, const blkptr_t *bp, int zio_flags,
+ const zbookmark_phys_t *zb)
+{
+ spa_t *spa = dp->dp_spa;
+
+ ASSERT(!BP_IS_EMBEDDED(bp));
+
+ /*
+ * Gang blocks are hard to issue sequentially, so we just issue them
+ * here immediately instead of queuing them.
+ */
+ if (!dp->dp_scan->scn_is_sorted || BP_IS_GANG(bp)) {
+ scan_exec_io(dp, bp, zio_flags, zb, NULL);
+ return;
+ }
- mutex_exit(&spa->spa_scrub_lock);
+ for (int i = 0; i < BP_GET_NDVAS(bp); i++) {
+ dva_t dva;
+ vdev_t *vdev;
+
+ dva = bp->blk_dva[i];
+ vdev = vdev_lookup_top(spa, DVA_GET_VDEV(&dva));
+ ASSERT(vdev != NULL);
+
+ /* the queue lock covers both lazy creation and the insert */
+ mutex_enter(&vdev->vdev_scan_io_queue_lock);
+ if (vdev->vdev_scan_io_queue == NULL)
+ vdev->vdev_scan_io_queue = scan_io_queue_create(vdev);
+ /*
+ * NOTE(review): dp->dp_scan was already dereferenced at the
+ * top of this function, so this assertion cannot catch a
+ * NULL dp_scan -- confirm whether it is still wanted here.
+ */
+ ASSERT(dp->dp_scan != NULL);
+ scan_io_queue_insert(vdev->vdev_scan_io_queue, bp,
+ i, zio_flags, zb);
+ mutex_exit(&vdev->vdev_scan_io_queue_lock);
+ }
}
static int
@@ -1971,137 +3531,390 @@
const blkptr_t *bp, const zbookmark_phys_t *zb)
{
dsl_scan_t *scn = dp->dp_scan;
- size_t size = BP_GET_PSIZE(bp);
spa_t *spa = dp->dp_spa;
uint64_t phys_birth = BP_PHYSICAL_BIRTH(bp);
+ size_t psize = BP_GET_PSIZE(bp);
boolean_t needs_io;
int zio_flags = ZIO_FLAG_SCAN_THREAD | ZIO_FLAG_RAW | ZIO_FLAG_CANFAIL;
- unsigned int scan_delay = 0;
-
+ int d;
+
if (phys_birth <= scn->scn_phys.scn_min_txg ||
phys_birth >= scn->scn_phys.scn_max_txg)
return (0);
- count_block(dp->dp_blkstats, bp);
-
- if (BP_IS_EMBEDDED(bp))
+ if (BP_IS_EMBEDDED(bp)) {
+ count_block(scn, dp->dp_blkstats, bp);
return (0);
+ }
ASSERT(DSL_SCAN_IS_SCRUB_RESILVER(scn));
if (scn->scn_phys.scn_func == POOL_SCAN_SCRUB) {
zio_flags |= ZIO_FLAG_SCRUB;
needs_io = B_TRUE;
- scan_delay = zfs_scrub_delay;
} else {
ASSERT3U(scn->scn_phys.scn_func, ==, POOL_SCAN_RESILVER);
zio_flags |= ZIO_FLAG_RESILVER;
needs_io = B_FALSE;
- scan_delay = zfs_resilver_delay;
}
/* If it's an intent log block, failure is expected. */
if (zb->zb_level == ZB_ZIL_LEVEL)
zio_flags |= ZIO_FLAG_SPECULATIVE;
- for (int d = 0; d < BP_GET_NDVAS(bp); d++) {
- vdev_t *vd = vdev_lookup_top(spa,
- DVA_GET_VDEV(&bp->blk_dva[d]));
+ for (d = 0; d < BP_GET_NDVAS(bp); d++) {
+ const dva_t *dva = &bp->blk_dva[d];
/*
* Keep track of how much data we've examined so that
* zpool(1M) status can make useful progress reports.
*/
- scn->scn_phys.scn_examined += DVA_GET_ASIZE(&bp->blk_dva[d]);
- spa->spa_scan_pass_exam += DVA_GET_ASIZE(&bp->blk_dva[d]);
+ scn->scn_phys.scn_examined += DVA_GET_ASIZE(dva);
+ spa->spa_scan_pass_exam += DVA_GET_ASIZE(dva);
/* if it's a resilver, this may not be in the target range */
- if (!needs_io) {
- if (DVA_GET_GANG(&bp->blk_dva[d])) {
- /*
- * Gang members may be spread across multiple
- * vdevs, so the best estimate we have is the
- * scrub range, which has already been checked.
- * XXX -- it would be better to change our
- * allocation policy to ensure that all
- * gang members reside on the same vdev.
- */
- needs_io = B_TRUE;
- } else {
- needs_io = vdev_dtl_contains(vd, DTL_PARTIAL,
- phys_birth, 1);
- }
- }
+ if (!needs_io)
+ needs_io = dsl_scan_need_resilver(spa, dva, psize,
+ phys_birth);
}
if (needs_io && !zfs_no_scrub_io) {
- vdev_t *rvd = spa->spa_root_vdev;
- uint64_t maxinflight = rvd->vdev_children *
- MAX(zfs_top_maxinflight, 1);
+ dsl_scan_enqueue(dp, bp, zio_flags, zb);
+ } else {
+ count_block(scn, dp->dp_blkstats, bp);
+ }
+ /* do not relocate this block */
+ return (0);
+}
+
+static void
+dsl_scan_scrub_done(zio_t *zio)
+{
+ spa_t *spa = zio->io_spa;
+ blkptr_t *bp = zio->io_bp;
+ dsl_scan_io_queue_t *queue = zio->io_private;
+
+ abd_free(zio->io_abd);
+
+ if (queue == NULL) {
mutex_enter(&spa->spa_scrub_lock);
- while (spa->spa_scrub_inflight >= maxinflight)
+ ASSERT3U(spa->spa_scrub_inflight, >=, BP_GET_PSIZE(bp));
+ spa->spa_scrub_inflight -= BP_GET_PSIZE(bp);
+ cv_broadcast(&spa->spa_scrub_io_cv);
+ mutex_exit(&spa->spa_scrub_lock);
+ } else {
+ mutex_enter(&queue->q_vd->vdev_scan_io_queue_lock);
+ ASSERT3U(queue->q_inflight_bytes, >=, BP_GET_PSIZE(bp));
+ queue->q_inflight_bytes -= BP_GET_PSIZE(bp);
+ cv_broadcast(&queue->q_zio_cv);
+ mutex_exit(&queue->q_vd->vdev_scan_io_queue_lock);
+ }
+
+ if (zio->io_error && (zio->io_error != ECKSUM ||
+ !(zio->io_flags & ZIO_FLAG_SPECULATIVE))) {
+ atomic_inc_64(&spa->spa_dsl_pool->dp_scan->scn_phys.scn_errors);
+ }
+}
+
+/*
+ * Given a scanning zio's information, executes the zio. The zio need
+ * not necessarily be sortable; this function simply executes the
+ * zio, no matter what it is. The optional queue argument allows the
+ * caller to specify that they want per top level vdev IO rate limiting
+ * instead of the legacy global limiting.
+ */
+static void
+scan_exec_io(dsl_pool_t *dp, const blkptr_t *bp, int zio_flags,
+ const zbookmark_phys_t *zb, dsl_scan_io_queue_t *queue)
+{
+ spa_t *spa = dp->dp_spa;
+ dsl_scan_t *scn = dp->dp_scan;
+ size_t size = BP_GET_PSIZE(bp);
+ abd_t *data = abd_alloc_for_io(size, B_FALSE);
+ unsigned int scan_delay = 0;
+
+ if (queue == NULL) {
+ mutex_enter(&spa->spa_scrub_lock);
+ while (spa->spa_scrub_inflight >= scn->scn_maxinflight_bytes)
cv_wait(&spa->spa_scrub_io_cv, &spa->spa_scrub_lock);
- spa->spa_scrub_inflight++;
+ spa->spa_scrub_inflight += BP_GET_PSIZE(bp);
mutex_exit(&spa->spa_scrub_lock);
+ } else {
+ kmutex_t *q_lock = &queue->q_vd->vdev_scan_io_queue_lock;
- /*
- * If we're seeing recent (zfs_scan_idle) "important" I/Os
- * then throttle our workload to limit the impact of a scan.
- */
- if (ddi_get_lbolt64() - spa->spa_last_io <= zfs_scan_idle)
- delay(MAX((int)scan_delay, 0));
+ mutex_enter(q_lock);
+ while (queue->q_inflight_bytes >= queue->q_maxinflight_bytes)
+ cv_wait(&queue->q_zio_cv, q_lock);
+ queue->q_inflight_bytes += BP_GET_PSIZE(bp);
+ mutex_exit(q_lock);
+ }
- zio_nowait(zio_read(NULL, spa, bp,
- abd_alloc_for_io(size, B_FALSE), size, dsl_scan_scrub_done,
- NULL, ZIO_PRIORITY_SCRUB, zio_flags, zb));
+ if (zio_flags & ZIO_FLAG_RESILVER)
+ scan_delay = zfs_resilver_delay;
+ else {
+ ASSERT(zio_flags & ZIO_FLAG_SCRUB);
+ scan_delay = zfs_scrub_delay;
}
- /* do not relocate this block */
- return (0);
+ if (scan_delay && (ddi_get_lbolt64() - spa->spa_last_io <= zfs_scan_idle))
+ delay(MAX((int)scan_delay, 0));
+
+ count_block(dp->dp_scan, dp->dp_blkstats, bp);
+ zio_nowait(zio_read(dp->dp_scan->scn_zio_root, spa, bp, data, size,
+ dsl_scan_scrub_done, queue, ZIO_PRIORITY_SCRUB, zio_flags, zb));
}
/*
- * Called by the ZFS_IOC_POOL_SCAN ioctl to start a scrub or resilver.
- * Can also be called to resume a paused scrub.
+ * This is the primary extent sorting algorithm. We balance two parameters:
+ * 1) how many bytes of I/O are in an extent
+ * 2) how well the extent is filled with I/O (as a fraction of its total size)
+ * Since we allow extents to have gaps between their constituent I/Os, it's
+ * possible to have a fairly large extent that contains the same amount of
+ * I/O bytes as a much smaller extent, which just packs the I/O more tightly.
+ * The algorithm sorts based on a score calculated from the extent's size,
+ * the relative fill volume (in %) and a "fill weight" parameter that controls
+ * the split between whether we prefer larger extents or more well populated
+ * extents:
+ *
+ * SCORE = FILL_IN_BYTES + (FILL_IN_PERCENT * FILL_IN_BYTES * FILL_WEIGHT)
+ *
+ * Example:
+ * 1) assume extsz = 64 MiB
+ * 2) assume fill = 32 MiB (extent is half full)
+ * 3) assume fill_weight = 3
+ * 4) SCORE = 32M + (((32M * 100) / 64M) * 3 * 32M) / 100
+ * SCORE = 32M + (50 * 3 * 32M) / 100
+ * SCORE = 32M + (4800M / 100)
+ * SCORE = 32M + 48M
+ * ^ ^
+ * | +--- final total relative fill-based score
+ * +--------- final total fill-based score
+ * SCORE = 80M
+ *
+ * As can be seen, at fill_weight=3, the algorithm is slightly biased towards
+ * extents that are more completely filled (in a 3:2 ratio) vs just larger.
+ * Note that as an optimization, we replace multiplication and division by
+ * 100 with bitshifting by 7 (which effectively multiplies and divides by 128).
*/
-int
-dsl_scan(dsl_pool_t *dp, pool_scan_func_t func)
+static int
+ext_size_compare(const void *x, const void *y)
{
- spa_t *spa = dp->dp_spa;
+ const range_seg_t *rsa = x, *rsb = y;
+ uint64_t sa = rsa->rs_end - rsa->rs_start,
+ sb = rsb->rs_end - rsb->rs_start;
+ uint64_t score_a, score_b;
+
+ score_a = rsa->rs_fill + ((((rsa->rs_fill << 7) / sa) *
+ fill_weight * rsa->rs_fill) >> 7);
+ score_b = rsb->rs_fill + ((((rsb->rs_fill << 7) / sb) *
+ fill_weight * rsb->rs_fill) >> 7);
+
+ if (score_a > score_b)
+ return (-1);
+ if (score_a == score_b) {
+ if (rsa->rs_start < rsb->rs_start)
+ return (-1);
+ if (rsa->rs_start == rsb->rs_start)
+ return (0);
+ return (1);
+ }
+ return (1);
+}
+
+/*
+ * Comparator for the q_sios_by_addr tree. Sorting is simply performed
+ * based on LBA-order (from lowest to highest).
+ */
+static int
+io_addr_compare(const void *x, const void *y)
+{
+ const scan_io_t *a = x, *b = y;
+
+ if (a->sio_offset < b->sio_offset)
+ return (-1);
+ if (a->sio_offset == b->sio_offset)
+ return (0);
+ return (1);
+}
+
+/* IO queues are created on demand when they are needed. */
+static dsl_scan_io_queue_t *
+scan_io_queue_create(vdev_t *vd)
+{
+ dsl_scan_t *scn = vd->vdev_spa->spa_dsl_pool->dp_scan;
+ dsl_scan_io_queue_t *q = kmem_zalloc(sizeof (*q), KM_SLEEP);
+
+ q->q_scn = scn;
+ q->q_vd = vd;
+ cv_init(&q->q_zio_cv, NULL, CV_DEFAULT, NULL);
+ q->q_exts_by_addr = range_tree_create_impl(&rt_avl_ops,
+ &q->q_exts_by_size, ext_size_compare, zfs_scan_max_ext_gap);
+ avl_create(&q->q_sios_by_addr, io_addr_compare,
+ sizeof (scan_io_t), offsetof(scan_io_t, sio_nodes.sio_addr_node));
+
+ return (q);
+}
+
+/*
+ * Destroys a scan queue and all segments and scan_io_t's contained in it.
+ * No further execution of I/O occurs, anything pending in the queue is
+ * simply freed without being executed.
+ */
+void
+dsl_scan_io_queue_destroy(dsl_scan_io_queue_t *queue)
+{
+ dsl_scan_t *scn = queue->q_scn;
+ scan_io_t *sio;
+ void *cookie = NULL;
+ int64_t bytes_dequeued = 0;
+
+ ASSERT(MUTEX_HELD(&queue->q_vd->vdev_scan_io_queue_lock));
+
+ while ((sio = avl_destroy_nodes(&queue->q_sios_by_addr, &cookie)) !=
+ NULL) {
+ ASSERT(range_tree_contains(queue->q_exts_by_addr,
+ sio->sio_offset, sio->sio_asize));
+ bytes_dequeued += sio->sio_asize;
+ kmem_free(sio, sizeof (*sio));
+ }
+
+ atomic_add_64(&scn->scn_bytes_pending, -bytes_dequeued);
+ range_tree_vacate(queue->q_exts_by_addr, NULL, queue);
+ range_tree_destroy(queue->q_exts_by_addr);
+ avl_destroy(&queue->q_sios_by_addr);
+ cv_destroy(&queue->q_zio_cv);
+
+ kmem_free(queue, sizeof (*queue));
+}
+
+/*
+ * Properly transfers a dsl_scan_io_queue_t from `svd' to `tvd'. This is
+ * called on behalf of vdev_top_transfer when creating or destroying
+ * a mirror vdev due to zpool attach/detach.
+ */
+void
+dsl_scan_io_queue_vdev_xfer(vdev_t *svd, vdev_t *tvd)
+{
+ mutex_enter(&svd->vdev_scan_io_queue_lock);
+ mutex_enter(&tvd->vdev_scan_io_queue_lock);
+
+ VERIFY3P(tvd->vdev_scan_io_queue, ==, NULL);
+ tvd->vdev_scan_io_queue = svd->vdev_scan_io_queue;
+ svd->vdev_scan_io_queue = NULL;
+ if (tvd->vdev_scan_io_queue != NULL)
+ tvd->vdev_scan_io_queue->q_vd = tvd;
+
+ mutex_exit(&tvd->vdev_scan_io_queue_lock);
+ mutex_exit(&svd->vdev_scan_io_queue_lock);
+}
+
+static void
+scan_io_queues_destroy(dsl_scan_t *scn)
+{
+ vdev_t *rvd = scn->scn_dp->dp_spa->spa_root_vdev;
+
+ for (uint64_t i = 0; i < rvd->vdev_children; i++) {
+ vdev_t *tvd = rvd->vdev_child[i];
+
+ mutex_enter(&tvd->vdev_scan_io_queue_lock);
+ if (tvd->vdev_scan_io_queue != NULL)
+ dsl_scan_io_queue_destroy(tvd->vdev_scan_io_queue);
+ tvd->vdev_scan_io_queue = NULL;
+ mutex_exit(&tvd->vdev_scan_io_queue_lock);
+ }
+}
+
+static void
+dsl_scan_freed_dva(spa_t *spa, const blkptr_t *bp, int dva_i)
+{
+ dsl_pool_t *dp = spa->spa_dsl_pool;
dsl_scan_t *scn = dp->dp_scan;
+ vdev_t *vdev;
+ kmutex_t *q_lock;
+ dsl_scan_io_queue_t *queue;
+ scan_io_t srch, *sio;
+ avl_index_t idx;
+ uint64_t start, size;
+ vdev = vdev_lookup_top(spa, DVA_GET_VDEV(&bp->blk_dva[dva_i]));
+ ASSERT(vdev != NULL);
+ q_lock = &vdev->vdev_scan_io_queue_lock;
+ queue = vdev->vdev_scan_io_queue;
+
+ mutex_enter(q_lock);
+ if (queue == NULL) {
+ mutex_exit(q_lock);
+ return;
+ }
+
+ bp2sio(bp, &srch, dva_i);
+ start = srch.sio_offset;
+ size = srch.sio_asize;
+
/*
- * Purge all vdev caches and probe all devices. We do this here
- * rather than in sync context because this requires a writer lock
- * on the spa_config lock, which we can't do from sync context. The
- * spa_scrub_reopen flag indicates that vdev_open() should not
- * attempt to start another scrub.
+ * We can find the zio in two states:
+ * 1) Cold, just sitting in the queue of zio's to be issued at
+ * some point in the future. In this case, all we do is
+ * remove the zio from the q_sios_by_addr tree, decrement
+ * its data volume from the containing range_seg_t and
+ * resort the q_exts_by_size tree to reflect that the
+ * range_seg_t has lost some of its 'fill'. We don't shorten
+ * the range_seg_t - this is usually rare enough not to be
+ * worth the extra hassle of trying to keep track of precise
+ * extent boundaries.
+ * 2) Hot, where the zio is currently in-flight in
+ * dsl_scan_issue_ios. In this case, we can't simply
+ * reach in and stop the in-flight zio's, so we instead
+ * block the caller. Eventually, dsl_scan_issue_ios will
+ * be done with issuing the zio's it gathered and will
+ * signal us.
*/
- spa_vdev_state_enter(spa, SCL_NONE);
- spa->spa_scrub_reopen = B_TRUE;
- vdev_reopen(spa->spa_root_vdev);
- spa->spa_scrub_reopen = B_FALSE;
- (void) spa_vdev_state_exit(spa, NULL, 0);
+ sio = avl_find(&queue->q_sios_by_addr, &srch, &idx);
+ if (sio != NULL) {
+ int64_t asize = sio->sio_asize;
+ blkptr_t tmpbp;
- if (func == POOL_SCAN_SCRUB && dsl_scan_is_paused_scrub(scn)) {
- /* got scrub start cmd, resume paused scrub */
- int err = dsl_scrub_set_pause_resume(scn->scn_dp,
- POOL_SCRUB_NORMAL);
- if (err == 0) {
- spa_event_notify(spa, NULL, NULL, ESC_ZFS_SCRUB_RESUME);
- return (ECANCELED);
- }
+ /* Got it while it was cold in the queue */
+ ASSERT3U(start, ==, sio->sio_offset);
+ ASSERT3U(size, ==, asize);
+ avl_remove(&queue->q_sios_by_addr, sio);
- return (SET_ERROR(err));
- }
+ ASSERT(range_tree_contains(queue->q_exts_by_addr, start, size));
+ range_tree_remove_fill(queue->q_exts_by_addr, start, size);
- return (dsl_sync_task(spa_name(spa), dsl_scan_setup_check,
- dsl_scan_setup_sync, &func, 0, ZFS_SPACE_CHECK_EXTRA_RESERVED));
+ /*
+ * We only update scn_bytes_pending in the cold path,
+ * otherwise it will already have been accounted for as
+ * part of the zio's execution.
+ */
+ atomic_add_64(&scn->scn_bytes_pending, -asize);
+
+ /* count the block as though we issued it */
+ sio2bp(sio, &tmpbp, dva_i);
+ count_block(scn, dp->dp_blkstats, &tmpbp);
+
+ kmem_free(sio, sizeof (*sio));
+ }
+ mutex_exit(q_lock);
}
-static boolean_t
-dsl_scan_restarting(dsl_scan_t *scn, dmu_tx_t *tx)
+/*
+ * Callback invoked when a zio_free() zio is executing. This needs to be
+ * intercepted to prevent the zio from deallocating a particular portion
+ * of disk space and it then getting reallocated and written to, while we
+ * still have it queued up for processing.
+ */
+void
+dsl_scan_freed(spa_t *spa, const blkptr_t *bp)
{
- return (scn->scn_restart_txg != 0 &&
- scn->scn_restart_txg <= tx->tx_txg);
+ dsl_pool_t *dp = spa->spa_dsl_pool;
+ dsl_scan_t *scn = dp->dp_scan;
+
+ ASSERT(!BP_IS_EMBEDDED(bp));
+ ASSERT(scn != NULL);
+ if (!dsl_scan_is_running(scn))
+ return;
+
+ for (int i = 0; i < BP_GET_NDVAS(bp); i++)
+ dsl_scan_freed_dva(spa, bp, i);
}
Index: head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/metaslab.c
===================================================================
--- head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/metaslab.c
+++ head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/metaslab.c
@@ -1120,85 +1120,6 @@
}
/*
- * Create any block allocator specific components. The current allocators
- * rely on using both a size-ordered range_tree_t and an array of uint64_t's.
- */
-static void
-metaslab_rt_create(range_tree_t *rt, void *arg)
-{
- metaslab_t *msp = arg;
-
- ASSERT3P(rt->rt_arg, ==, msp);
- ASSERT(msp->ms_allocatable == NULL);
-
- avl_create(&msp->ms_allocatable_by_size, metaslab_rangesize_compare,
- sizeof (range_seg_t), offsetof(range_seg_t, rs_pp_node));
-}
-
-/*
- * Destroy the block allocator specific components.
- */
-static void
-metaslab_rt_destroy(range_tree_t *rt, void *arg)
-{
- metaslab_t *msp = arg;
-
- ASSERT3P(rt->rt_arg, ==, msp);
- ASSERT3P(msp->ms_allocatable, ==, rt);
- ASSERT0(avl_numnodes(&msp->ms_allocatable_by_size));
-
- avl_destroy(&msp->ms_allocatable_by_size);
-}
-
-static void
-metaslab_rt_add(range_tree_t *rt, range_seg_t *rs, void *arg)
-{
- metaslab_t *msp = arg;
-
- ASSERT3P(rt->rt_arg, ==, msp);
- ASSERT3P(msp->ms_allocatable, ==, rt);
- VERIFY(!msp->ms_condensing);
- avl_add(&msp->ms_allocatable_by_size, rs);
-}
-
-static void
-metaslab_rt_remove(range_tree_t *rt, range_seg_t *rs, void *arg)
-{
- metaslab_t *msp = arg;
-
- ASSERT3P(rt->rt_arg, ==, msp);
- ASSERT3P(msp->ms_allocatable, ==, rt);
- VERIFY(!msp->ms_condensing);
- avl_remove(&msp->ms_allocatable_by_size, rs);
-}
-
-static void
-metaslab_rt_vacate(range_tree_t *rt, void *arg)
-{
- metaslab_t *msp = arg;
-
- ASSERT3P(rt->rt_arg, ==, msp);
- ASSERT3P(msp->ms_allocatable, ==, rt);
-
- /*
- * Normally one would walk the tree freeing nodes along the way.
- * Since the nodes are shared with the range trees we can avoid
- * walking all nodes and just reinitialize the avl tree. The nodes
- * will be freed by the range tree, so we don't want to free them here.
- */
- avl_create(&msp->ms_allocatable_by_size, metaslab_rangesize_compare,
- sizeof (range_seg_t), offsetof(range_seg_t, rs_pp_node));
-}
-
-static range_tree_ops_t metaslab_rt_ops = {
- metaslab_rt_create,
- metaslab_rt_destroy,
- metaslab_rt_add,
- metaslab_rt_remove,
- metaslab_rt_vacate
-};
-
-/*
* ==========================================================================
* Common allocator routines
* ==========================================================================
@@ -1574,7 +1495,8 @@
* addition of new space; and for debugging, it ensures that we'd
* data fault on any attempt to use this metaslab before it's ready.
*/
- ms->ms_allocatable = range_tree_create(&metaslab_rt_ops, ms);
+ ms->ms_allocatable = range_tree_create_impl(&rt_avl_ops, &ms->ms_allocatable_by_size,
+ metaslab_rangesize_compare, 0);
metaslab_group_add(mg, ms);
metaslab_set_fragmentation(ms);
Index: head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/range_tree.c
===================================================================
--- head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/range_tree.c
+++ head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/range_tree.c
@@ -33,8 +33,58 @@
#include <sys/zio.h>
#include <sys/range_tree.h>
+/*
+ * Range trees are tree-based data structures that can be used to
+ * track free space or generally any space allocation information.
+ * A range tree keeps track of individual segments and automatically
+ * provides facilities such as adjacent extent merging and extent
+ * splitting in response to range add/remove requests.
+ *
+ * A range tree starts out completely empty, with no segments in it.
+ * Adding an allocation via range_tree_add to the range tree can either:
+ * 1) create a new extent
+ * 2) extend an adjacent extent
+ * 3) merge two adjacent extents
+ * Conversely, removing an allocation via range_tree_remove can:
+ * 1) completely remove an extent
+ * 2) shorten an extent (if the allocation was near one of its ends)
+ * 3) split an extent into two extents, in effect punching a hole
+ *
+ * A range tree is also capable of 'bridging' gaps when adding
+ * allocations. This is useful for cases when close proximity of
+ * allocations is an important detail that needs to be represented
+ * in the range tree. See range_tree_create_impl(). The default behavior
+ * is not to bridge gaps (i.e. the maximum allowed gap size is 0).
+ *
+ * In order to traverse a range tree, use either the range_tree_walk()
+ * or range_tree_vacate() functions.
+ *
+ * To obtain more accurate information on individual segment
+ * operations that the range tree performs "under the hood", you can
+ * specify a set of callbacks by passing a range_tree_ops_t structure
+ * to the range_tree_create function. Any callbacks that are non-NULL
+ * are then called at the appropriate times.
+ *
+ * The range tree code also supports a special variant of range trees
+ * that can bridge small gaps between segments. This kind of tree is used
+ * by the dsl scanning code to group I/Os into mostly sequential chunks to
+ * optimize disk performance. The code here attempts to do this with as
+ * little memory and computational overhead as possible. One limitation of
+ * this implementation is that segments of range trees with gaps can only
+ * support removing complete segments.
+ */
+
kmem_cache_t *range_seg_cache;
+/* Generic ops for managing an AVL tree alongside a range tree */
+struct range_tree_ops rt_avl_ops = {
+ .rtop_create = rt_avl_create,
+ .rtop_destroy = rt_avl_destroy,
+ .rtop_add = rt_avl_add,
+ .rtop_remove = rt_avl_remove,
+ .rtop_vacate = rt_avl_vacate,
+};
+
void
range_tree_init(void)
{
@@ -109,47 +159,47 @@
static int
range_tree_seg_compare(const void *x1, const void *x2)
{
- const range_seg_t *r1 = x1;
- const range_seg_t *r2 = x2;
+ const range_seg_t *r1 = (const range_seg_t *)x1;
+ const range_seg_t *r2 = (const range_seg_t *)x2;
- if (r1->rs_start < r2->rs_start) {
- if (r1->rs_end > r2->rs_start)
- return (0);
- return (-1);
- }
- if (r1->rs_start > r2->rs_start) {
- if (r1->rs_start < r2->rs_end)
- return (0);
- return (1);
- }
- return (0);
+ ASSERT3U(r1->rs_start, <=, r1->rs_end);
+ ASSERT3U(r2->rs_start, <=, r2->rs_end);
+
+ return ((r1->rs_start >= r2->rs_end) - (r1->rs_end <= r2->rs_start));
}
range_tree_t *
-range_tree_create(range_tree_ops_t *ops, void *arg)
+range_tree_create_impl(range_tree_ops_t *ops, void *arg,
+ int (*avl_compare) (const void *, const void *), uint64_t gap)
{
- range_tree_t *rt;
+ range_tree_t *rt = kmem_zalloc(sizeof (range_tree_t), KM_SLEEP);
- rt = kmem_zalloc(sizeof (range_tree_t), KM_SLEEP);
-
avl_create(&rt->rt_root, range_tree_seg_compare,
sizeof (range_seg_t), offsetof(range_seg_t, rs_node));
rt->rt_ops = ops;
rt->rt_arg = arg;
+ rt->rt_gap = gap;
+ rt->rt_avl_compare = avl_compare;
- if (rt->rt_ops != NULL)
+ if (rt->rt_ops != NULL && rt->rt_ops->rtop_create != NULL)
rt->rt_ops->rtop_create(rt, rt->rt_arg);
return (rt);
}
+range_tree_t *
+range_tree_create(range_tree_ops_t *ops, void *arg)
+{
+ return (range_tree_create_impl(ops, arg, NULL, 0));
+}
+
void
range_tree_destroy(range_tree_t *rt)
{
VERIFY0(rt->rt_space);
- if (rt->rt_ops != NULL)
+ if (rt->rt_ops != NULL && rt->rt_ops->rtop_destroy != NULL)
rt->rt_ops->rtop_destroy(rt, rt->rt_arg);
avl_destroy(&rt->rt_root);
@@ -157,39 +207,99 @@
}
void
-range_tree_add(void *arg, uint64_t start, uint64_t size)
+range_tree_adjust_fill(range_tree_t *rt, range_seg_t *rs, int64_t delta)
{
+ ASSERT3U(rs->rs_fill + delta, !=, 0);
+ ASSERT3U(rs->rs_fill + delta, <=, rs->rs_end - rs->rs_start);
+
+ if (rt->rt_ops != NULL && rt->rt_ops->rtop_remove != NULL)
+ rt->rt_ops->rtop_remove(rt, rs, rt->rt_arg);
+ rs->rs_fill += delta;
+ if (rt->rt_ops != NULL && rt->rt_ops->rtop_add != NULL)
+ rt->rt_ops->rtop_add(rt, rs, rt->rt_arg);
+}
+
+static void
+range_tree_add_impl(void *arg, uint64_t start, uint64_t size, uint64_t fill)
+{
range_tree_t *rt = arg;
avl_index_t where;
range_seg_t rsearch, *rs_before, *rs_after, *rs;
- uint64_t end = start + size;
+ uint64_t end = start + size, gap = rt->rt_gap;
+ uint64_t bridge_size = 0;
boolean_t merge_before, merge_after;
- VERIFY(size != 0);
+ ASSERT3U(size, !=, 0);
+ ASSERT3U(fill, <=, size);
rsearch.rs_start = start;
rsearch.rs_end = end;
rs = avl_find(&rt->rt_root, &rsearch, &where);
- if (rs != NULL && rs->rs_start <= start && rs->rs_end >= end) {
+ if (gap == 0 && rs != NULL &&
+ rs->rs_start <= start && rs->rs_end >= end) {
zfs_panic_recover("zfs: allocating allocated segment"
- "(offset=%llu size=%llu)\n",
- (longlong_t)start, (longlong_t)size);
+ "(offset=%llu size=%llu) of (offset=%llu size=%llu)\n",
+ (longlong_t)start, (longlong_t)size,
+ (longlong_t)rs->rs_start,
+ (longlong_t)rs->rs_end - rs->rs_start);
return;
}
- /* Make sure we don't overlap with either of our neighbors */
- VERIFY(rs == NULL);
+ /*
+ * If this is a gap-supporting range tree, it is possible that we
+ * are inserting into an existing segment. In this case simply
+ * bump the fill count and call the remove / add callbacks. If the
+ * new range will extend an existing segment, we remove the
+ * existing one, apply the new extent to it and re-insert it using
+ * the normal code paths.
+ */
+ if (rs != NULL) {
+ ASSERT3U(gap, !=, 0);
+ if (rs->rs_start <= start && rs->rs_end >= end) {
+ range_tree_adjust_fill(rt, rs, fill);
+ return;
+ }
+ avl_remove(&rt->rt_root, rs);
+ if (rt->rt_ops != NULL && rt->rt_ops->rtop_remove != NULL)
+ rt->rt_ops->rtop_remove(rt, rs, rt->rt_arg);
+
+ range_tree_stat_decr(rt, rs);
+ rt->rt_space -= rs->rs_end - rs->rs_start;
+
+ fill += rs->rs_fill;
+ start = MIN(start, rs->rs_start);
+ end = MAX(end, rs->rs_end);
+ size = end - start;
+
+ range_tree_add_impl(rt, start, size, fill);
+
+ kmem_cache_free(range_seg_cache, rs);
+ return;
+ }
+
+ ASSERT3P(rs, ==, NULL);
+
+ /*
+ * Determine whether or not we will have to merge with our neighbors.
+ * If gap != 0, we might need to merge with our neighbors even if we
+ * aren't directly touching.
+ */
rs_before = avl_nearest(&rt->rt_root, where, AVL_BEFORE);
rs_after = avl_nearest(&rt->rt_root, where, AVL_AFTER);
- merge_before = (rs_before != NULL && rs_before->rs_end == start);
- merge_after = (rs_after != NULL && rs_after->rs_start == end);
+ merge_before = (rs_before != NULL && rs_before->rs_end >= start - gap);
+ merge_after = (rs_after != NULL && rs_after->rs_start <= end + gap);
+ if (merge_before && gap != 0)
+ bridge_size += start - rs_before->rs_end;
+ if (merge_after && gap != 0)
+ bridge_size += rs_after->rs_start - end;
+
if (merge_before && merge_after) {
avl_remove(&rt->rt_root, rs_before);
- if (rt->rt_ops != NULL) {
+ if (rt->rt_ops != NULL && rt->rt_ops->rtop_remove != NULL) {
rt->rt_ops->rtop_remove(rt, rs_before, rt->rt_arg);
rt->rt_ops->rtop_remove(rt, rs_after, rt->rt_arg);
}
@@ -197,43 +307,59 @@
range_tree_stat_decr(rt, rs_before);
range_tree_stat_decr(rt, rs_after);
+ rs_after->rs_fill += rs_before->rs_fill + fill;
rs_after->rs_start = rs_before->rs_start;
kmem_cache_free(range_seg_cache, rs_before);
rs = rs_after;
} else if (merge_before) {
- if (rt->rt_ops != NULL)
+ if (rt->rt_ops != NULL && rt->rt_ops->rtop_remove != NULL)
rt->rt_ops->rtop_remove(rt, rs_before, rt->rt_arg);
range_tree_stat_decr(rt, rs_before);
+ rs_before->rs_fill += fill;
rs_before->rs_end = end;
rs = rs_before;
} else if (merge_after) {
- if (rt->rt_ops != NULL)
+ if (rt->rt_ops != NULL && rt->rt_ops->rtop_remove != NULL)
rt->rt_ops->rtop_remove(rt, rs_after, rt->rt_arg);
range_tree_stat_decr(rt, rs_after);
+ rs_after->rs_fill += fill;
rs_after->rs_start = start;
rs = rs_after;
} else {
rs = kmem_cache_alloc(range_seg_cache, KM_SLEEP);
+
+ rs->rs_fill = fill;
rs->rs_start = start;
rs->rs_end = end;
avl_insert(&rt->rt_root, rs, where);
}
- if (rt->rt_ops != NULL)
+ if (gap != 0)
+ ASSERT3U(rs->rs_fill, <=, rs->rs_end - rs->rs_start);
+ else
+ ASSERT3U(rs->rs_fill, ==, rs->rs_end - rs->rs_start);
+
+ if (rt->rt_ops != NULL && rt->rt_ops->rtop_add != NULL)
rt->rt_ops->rtop_add(rt, rs, rt->rt_arg);
range_tree_stat_incr(rt, rs);
- rt->rt_space += size;
+ rt->rt_space += size + bridge_size;
}
void
-range_tree_remove(void *arg, uint64_t start, uint64_t size)
+range_tree_add(void *arg, uint64_t start, uint64_t size)
{
- range_tree_t *rt = arg;
+ range_tree_add_impl(arg, start, size, size);
+}
+
+static void
+range_tree_remove_impl(range_tree_t *rt, uint64_t start, uint64_t size,
+ boolean_t do_fill)
+{
avl_index_t where;
range_seg_t rsearch, *rs, *newseg;
uint64_t end = start + size;
@@ -253,6 +379,34 @@
(longlong_t)start, (longlong_t)size);
return;
}
+
+ /*
+ * Range trees with gap support must only remove complete segments
+ * from the tree. This allows us to maintain accurate fill accounting
+ * and to ensure that bridged sections are not leaked. If we need to
+ * remove less than the full segment, we can only adjust the fill count.
+ */
+ if (rt->rt_gap != 0) {
+ if (do_fill) {
+ if (rs->rs_fill == size) {
+ start = rs->rs_start;
+ end = rs->rs_end;
+ size = end - start;
+ } else {
+ range_tree_adjust_fill(rt, rs, -size);
+ return;
+ }
+ } else if (rs->rs_start != start || rs->rs_end != end) {
+ zfs_panic_recover("zfs: freeing partial segment of "
+ "gap tree (offset=%llu size=%llu) of "
+ "(offset=%llu size=%llu)",
+ (longlong_t)start, (longlong_t)size,
+ (longlong_t)rs->rs_start,
+ (longlong_t)rs->rs_end - rs->rs_start);
+ return;
+ }
+ }
+
VERIFY3U(rs->rs_start, <=, start);
VERIFY3U(rs->rs_end, >=, end);
@@ -261,19 +415,20 @@
range_tree_stat_decr(rt, rs);
- if (rt->rt_ops != NULL)
+ if (rt->rt_ops != NULL && rt->rt_ops->rtop_remove != NULL)
rt->rt_ops->rtop_remove(rt, rs, rt->rt_arg);
if (left_over && right_over) {
newseg = kmem_cache_alloc(range_seg_cache, KM_SLEEP);
newseg->rs_start = end;
newseg->rs_end = rs->rs_end;
+ newseg->rs_fill = newseg->rs_end - newseg->rs_start;
range_tree_stat_incr(rt, newseg);
rs->rs_end = start;
avl_insert_here(&rt->rt_root, newseg, rs, AVL_AFTER);
- if (rt->rt_ops != NULL)
+ if (rt->rt_ops != NULL && rt->rt_ops->rtop_add != NULL)
rt->rt_ops->rtop_add(rt, newseg, rt->rt_arg);
} else if (left_over) {
rs->rs_end = start;
@@ -286,15 +441,53 @@
}
if (rs != NULL) {
+ /*
+ * The fill of the leftover segment will always be equal to
+ * the size, since we do not support removing partial segments
+ * of range trees with gaps.
+ */
+ rs->rs_fill = rs->rs_end - rs->rs_start;
range_tree_stat_incr(rt, rs);
- if (rt->rt_ops != NULL)
+ if (rt->rt_ops != NULL && rt->rt_ops->rtop_add != NULL)
rt->rt_ops->rtop_add(rt, rs, rt->rt_arg);
}
rt->rt_space -= size;
}
+void
+range_tree_remove(void *arg, uint64_t start, uint64_t size)
+{
+ range_tree_remove_impl(arg, start, size, B_FALSE);
+}
+
+void
+range_tree_remove_fill(range_tree_t *rt, uint64_t start, uint64_t size)
+{
+ range_tree_remove_impl(rt, start, size, B_TRUE);
+}
+
+void
+range_tree_resize_segment(range_tree_t *rt, range_seg_t *rs,
+ uint64_t newstart, uint64_t newsize)
+{
+ int64_t delta = newsize - (rs->rs_end - rs->rs_start);
+
+ range_tree_stat_decr(rt, rs);
+ if (rt->rt_ops != NULL && rt->rt_ops->rtop_remove != NULL)
+ rt->rt_ops->rtop_remove(rt, rs, rt->rt_arg);
+
+ rs->rs_start = newstart;
+ rs->rs_end = newstart + newsize;
+
+ range_tree_stat_incr(rt, rs);
+ if (rt->rt_ops != NULL && rt->rt_ops->rtop_add != NULL)
+ rt->rt_ops->rtop_add(rt, rs, rt->rt_arg);
+
+ rt->rt_space += delta;
+}
+
static range_seg_t *
range_tree_find_impl(range_tree_t *rt, uint64_t start, uint64_t size)
{
@@ -309,7 +502,7 @@
return (avl_find(&rt->rt_root, &rsearch, &where));
}
-static range_seg_t *
+range_seg_t *
range_tree_find(range_tree_t *rt, uint64_t start, uint64_t size)
{
range_seg_t *rs = range_tree_find_impl(rt, start, size);
@@ -373,7 +566,7 @@
void *cookie = NULL;
- if (rt->rt_ops != NULL)
+ if (rt->rt_ops != NULL && rt->rt_ops->rtop_vacate != NULL)
rt->rt_ops->rtop_vacate(rt, rt->rt_arg);
while ((rs = avl_destroy_nodes(&rt->rt_root, &cookie)) != NULL) {
@@ -395,10 +588,61 @@
func(arg, rs->rs_start, rs->rs_end - rs->rs_start);
}
+range_seg_t *
+range_tree_first(range_tree_t *rt)
+{
+ return (avl_first(&rt->rt_root));
+}
+
uint64_t
range_tree_space(range_tree_t *rt)
{
return (rt->rt_space);
+}
+
+/* Generic range tree functions for maintaining segments in an AVL tree. */
+void
+rt_avl_create(range_tree_t *rt, void *arg)
+{
+ avl_tree_t *tree = arg;
+
+ avl_create(tree, rt->rt_avl_compare, sizeof (range_seg_t),
+ offsetof(range_seg_t, rs_pp_node));
+}
+
+void
+rt_avl_destroy(range_tree_t *rt, void *arg)
+{
+ avl_tree_t *tree = arg;
+
+ ASSERT0(avl_numnodes(tree));
+ avl_destroy(tree);
+}
+
+void
+rt_avl_add(range_tree_t *rt, range_seg_t *rs, void *arg)
+{
+ avl_tree_t *tree = arg;
+ avl_add(tree, rs);
+}
+
+void
+rt_avl_remove(range_tree_t *rt, range_seg_t *rs, void *arg)
+{
+ avl_tree_t *tree = arg;
+ avl_remove(tree, rs);
+}
+
+void
+rt_avl_vacate(range_tree_t *rt, void *arg)
+{
+ /*
+ * Normally one would walk the tree freeing nodes along the way.
+ * Since the nodes are shared with the range trees we can avoid
+ * walking all nodes and just reinitialize the avl tree. The nodes
+ * will be freed by the range tree, so we don't want to free them here.
+ */
+ rt_avl_create(rt, arg);
}
boolean_t
Index: head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/spa.c
===================================================================
--- head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/spa.c
+++ head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/spa.c
@@ -2035,7 +2035,7 @@
}
mutex_enter(&spa->spa_scrub_lock);
- spa->spa_scrub_inflight--;
+ spa->spa_load_verify_ios--;
cv_broadcast(&spa->spa_scrub_io_cv);
mutex_exit(&spa->spa_scrub_lock);
}
@@ -2082,9 +2082,9 @@
size_t size = BP_GET_PSIZE(bp);
mutex_enter(&spa->spa_scrub_lock);
- while (spa->spa_scrub_inflight >= spa_load_verify_maxinflight)
+ while (spa->spa_load_verify_ios >= spa_load_verify_maxinflight)
cv_wait(&spa->spa_scrub_io_cv, &spa->spa_scrub_lock);
- spa->spa_scrub_inflight++;
+ spa->spa_load_verify_ios++;
mutex_exit(&spa->spa_scrub_lock);
zio_nowait(zio_read(rio, spa, bp, abd_alloc_for_io(size, B_FALSE), size,
Index: head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/spa_misc.c
===================================================================
--- head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/spa_misc.c
+++ head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/spa_misc.c
@@ -2095,6 +2095,8 @@
zpool_feature_init();
spa_config_load();
l2arc_start();
+ scan_init();
+ dsl_scan_global_init();
#ifndef illumos
#ifdef _KERNEL
zfs_deadman_init();
@@ -2119,7 +2121,8 @@
range_tree_fini();
unique_fini();
refcount_fini();
-
+ scan_fini();
+
avl_destroy(&spa_namespace_avl);
avl_destroy(&spa_spare_avl);
avl_destroy(&spa_l2cache_avl);
@@ -2220,6 +2223,7 @@
spa->spa_scan_pass_scrub_pause = 0;
spa->spa_scan_pass_scrub_spent_paused = 0;
spa->spa_scan_pass_exam = 0;
+ spa->spa_scan_pass_issued = 0;
vdev_scan_stat_init(spa->spa_root_vdev);
}
@@ -2237,18 +2241,20 @@
/* data stored on disk */
ps->pss_func = scn->scn_phys.scn_func;
+ ps->pss_state = scn->scn_phys.scn_state;
ps->pss_start_time = scn->scn_phys.scn_start_time;
ps->pss_end_time = scn->scn_phys.scn_end_time;
ps->pss_to_examine = scn->scn_phys.scn_to_examine;
- ps->pss_examined = scn->scn_phys.scn_examined;
ps->pss_to_process = scn->scn_phys.scn_to_process;
ps->pss_processed = scn->scn_phys.scn_processed;
ps->pss_errors = scn->scn_phys.scn_errors;
- ps->pss_state = scn->scn_phys.scn_state;
-
+ ps->pss_examined = scn->scn_phys.scn_examined;
+ ps->pss_issued =
+ scn->scn_issued_before_pass + spa->spa_scan_pass_issued;
/* data not stored on disk */
ps->pss_pass_start = spa->spa_scan_pass_start;
ps->pss_pass_exam = spa->spa_scan_pass_exam;
+ ps->pss_pass_issued = spa->spa_scan_pass_issued;
ps->pss_pass_scrub_pause = spa->spa_scan_pass_scrub_pause;
ps->pss_pass_scrub_spent_paused = spa->spa_scan_pass_scrub_spent_paused;
Index: head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/arc.h
===================================================================
--- head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/arc.h
+++ head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/arc.h
@@ -58,11 +58,13 @@
typedef struct arc_buf_hdr arc_buf_hdr_t;
typedef struct arc_buf arc_buf_t;
-typedef void arc_done_func_t(zio_t *zio, arc_buf_t *buf, void *priv);
+typedef void arc_read_done_func_t(zio_t *zio, const zbookmark_phys_t *zb,
+ const blkptr_t *bp, arc_buf_t *buf, void *priv);
+typedef void arc_write_done_func_t(zio_t *zio, arc_buf_t *buf, void *priv);
/* generic arc_done_func_t's which you can use */
-arc_done_func_t arc_bcopy_func;
-arc_done_func_t arc_getbuf_func;
+arc_read_done_func_t arc_bcopy_func;
+arc_read_done_func_t arc_getbuf_func;
typedef enum arc_flags
{
@@ -75,35 +77,36 @@
ARC_FLAG_CACHED = 1 << 3, /* I/O was in cache */
ARC_FLAG_L2CACHE = 1 << 4, /* cache in L2ARC */
ARC_FLAG_PREDICTIVE_PREFETCH = 1 << 5, /* I/O from zfetch */
+ ARC_FLAG_PRESCIENT_PREFETCH = 1 << 6, /* long min lifespan */
/*
* Private ARC flags. These flags are private ARC only flags that
* will show up in b_flags in the arc_hdr_buf_t. These flags should
* only be set by ARC code.
*/
- ARC_FLAG_IN_HASH_TABLE = 1 << 6, /* buffer is hashed */
- ARC_FLAG_IO_IN_PROGRESS = 1 << 7, /* I/O in progress */
- ARC_FLAG_IO_ERROR = 1 << 8, /* I/O failed for buf */
- ARC_FLAG_INDIRECT = 1 << 9, /* indirect block */
+ ARC_FLAG_IN_HASH_TABLE = 1 << 7, /* buffer is hashed */
+ ARC_FLAG_IO_IN_PROGRESS = 1 << 8, /* I/O in progress */
+ ARC_FLAG_IO_ERROR = 1 << 9, /* I/O failed for buf */
+ ARC_FLAG_INDIRECT = 1 << 10, /* indirect block */
/* Indicates that block was read with ASYNC priority. */
- ARC_FLAG_PRIO_ASYNC_READ = 1 << 10,
- ARC_FLAG_L2_WRITING = 1 << 11, /* write in progress */
- ARC_FLAG_L2_EVICTED = 1 << 12, /* evicted during I/O */
- ARC_FLAG_L2_WRITE_HEAD = 1 << 13, /* head of write list */
+ ARC_FLAG_PRIO_ASYNC_READ = 1 << 11,
+ ARC_FLAG_L2_WRITING = 1 << 12, /* write in progress */
+ ARC_FLAG_L2_EVICTED = 1 << 13, /* evicted during I/O */
+ ARC_FLAG_L2_WRITE_HEAD = 1 << 14, /* head of write list */
/* indicates that the buffer contains metadata (otherwise, data) */
- ARC_FLAG_BUFC_METADATA = 1 << 14,
+ ARC_FLAG_BUFC_METADATA = 1 << 15,
/* Flags specifying whether optional hdr struct fields are defined */
- ARC_FLAG_HAS_L1HDR = 1 << 15,
- ARC_FLAG_HAS_L2HDR = 1 << 16,
+ ARC_FLAG_HAS_L1HDR = 1 << 16,
+ ARC_FLAG_HAS_L2HDR = 1 << 17,
/*
* Indicates the arc_buf_hdr_t's b_pdata matches the on-disk data.
* This allows the l2arc to use the blkptr's checksum to verify
* the data without having to store the checksum in the hdr.
*/
- ARC_FLAG_COMPRESSED_ARC = 1 << 17,
- ARC_FLAG_SHARED_DATA = 1 << 18,
+ ARC_FLAG_COMPRESSED_ARC = 1 << 18,
+ ARC_FLAG_SHARED_DATA = 1 << 19,
/*
* The arc buffer's compression mode is stored in the top 7 bits of the
@@ -179,12 +182,12 @@
#endif
int arc_read(zio_t *pio, spa_t *spa, const blkptr_t *bp,
- arc_done_func_t *done, void *priv, zio_priority_t priority, int flags,
- arc_flags_t *arc_flags, const zbookmark_phys_t *zb);
+ arc_read_done_func_t *done, void *priv, zio_priority_t priority,
+ int flags, arc_flags_t *arc_flags, const zbookmark_phys_t *zb);
zio_t *arc_write(zio_t *pio, spa_t *spa, uint64_t txg,
blkptr_t *bp, arc_buf_t *buf, boolean_t l2arc, const zio_prop_t *zp,
- arc_done_func_t *ready, arc_done_func_t *child_ready,
- arc_done_func_t *physdone, arc_done_func_t *done,
+ arc_write_done_func_t *ready, arc_write_done_func_t *child_ready,
+ arc_write_done_func_t *physdone, arc_write_done_func_t *done,
void *priv, zio_priority_t priority, int zio_flags,
const zbookmark_phys_t *zb);
void arc_freed(spa_t *spa, const blkptr_t *bp);
Index: head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/dsl_pool.h
===================================================================
--- head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/dsl_pool.h
+++ head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/dsl_pool.h
@@ -76,6 +76,7 @@
typedef struct zfs_all_blkstats {
zfs_blkstat_t zab_type[DN_MAX_LEVELS + 1][DMU_OT_TOTAL + 1];
+ kmutex_t zab_lock;
} zfs_all_blkstats_t;
Index: head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/dsl_scan.h
===================================================================
--- head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/dsl_scan.h
+++ head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/dsl_scan.h
@@ -107,24 +107,58 @@
typedef struct dsl_scan {
struct dsl_pool *scn_dp;
- boolean_t scn_suspending;
uint64_t scn_restart_txg;
uint64_t scn_done_txg;
uint64_t scn_sync_start_time;
- zio_t *scn_zio_root;
+ uint64_t scn_issued_before_pass;
/* for freeing blocks */
boolean_t scn_is_bptree;
boolean_t scn_async_destroying;
boolean_t scn_async_stalled;
uint64_t scn_async_block_min_time_ms;
+ /* flags and stats for controlling scan state */
+ boolean_t scn_is_sorted; /* doing sequential scan */
+ boolean_t scn_clearing; /* scan is issuing sequential extents */
+ boolean_t scn_checkpointing; /* scan is issuing all queued extents */
+ boolean_t scn_suspending; /* scan is suspending until next txg */
+ uint64_t scn_last_checkpoint; /* time of last checkpoint */
- /* for debugging / information */
- uint64_t scn_visited_this_txg;
+ /* members for thread synchronization */
+ zio_t *scn_zio_root; /* root zio for waiting on IO */
+ taskq_t *scn_taskq; /* task queue for issuing extents */
- dsl_scan_phys_t scn_phys;
+ /* for controlling scan prefetch, protected by spa_scrub_lock */
+ boolean_t scn_prefetch_stop; /* prefetch should stop */
+ zbookmark_phys_t scn_prefetch_bookmark; /* prefetch start bookmark */
+ avl_tree_t scn_prefetch_queue; /* priority queue of prefetch IOs */
+ uint64_t scn_maxinflight_bytes; /* max bytes in flight for pool */
+
+ /* per txg statistics */
+ uint64_t scn_visited_this_txg; /* total bps visited this txg */
+ uint64_t scn_holes_this_txg;
+ uint64_t scn_lt_min_this_txg;
+ uint64_t scn_gt_max_this_txg;
+ uint64_t scn_ddt_contained_this_txg;
+ uint64_t scn_objsets_visited_this_txg;
+ uint64_t scn_avg_seg_size_this_txg;
+ uint64_t scn_segs_this_txg;
+ uint64_t scn_avg_zio_size_this_txg;
+ uint64_t scn_zios_this_txg;
+
+ /* members needed for syncing scan status to disk */
+ dsl_scan_phys_t scn_phys; /* on disk representation of scan */
+ dsl_scan_phys_t scn_phys_cached;
+ avl_tree_t scn_queue; /* queue of datasets to scan */
+ uint64_t scn_bytes_pending; /* outstanding data to issue */
} dsl_scan_t;
+typedef struct dsl_scan_io_queue dsl_scan_io_queue_t;
+
+void dsl_scan_global_init(void);
+
+void scan_init(void);
+void scan_fini(void);
int dsl_scan_init(struct dsl_pool *dp, uint64_t txg);
void dsl_scan_fini(struct dsl_pool *dp);
void dsl_scan_sync(struct dsl_pool *, dmu_tx_t *);
@@ -143,6 +177,9 @@
struct dmu_tx *tx);
boolean_t dsl_scan_active(dsl_scan_t *scn);
boolean_t dsl_scan_is_paused_scrub(const dsl_scan_t *scn);
+void dsl_scan_freed(spa_t *spa, const blkptr_t *bp);
+void dsl_scan_io_queue_destroy(dsl_scan_io_queue_t *queue);
+void dsl_scan_io_queue_vdev_xfer(vdev_t *svd, vdev_t *tvd);
#ifdef __cplusplus
}
Index: head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/range_tree.h
===================================================================
--- head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/range_tree.h
+++ head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/range_tree.h
@@ -51,6 +51,9 @@
range_tree_ops_t *rt_ops;
void *rt_arg;
+ /* rt_avl_compare should only be set if rt_arg is an AVL tree */
+ uint64_t rt_gap; /* allowable inter-segment gap */
+ int (*rt_avl_compare)(const void *, const void *);
/*
* The rt_histogram maintains a histogram of ranges. Each bucket,
* rt_histogram[i], contains the number of ranges whose size is:
@@ -64,6 +67,7 @@
avl_node_t rs_pp_node; /* AVL picker-private node */
uint64_t rs_start; /* starting offset of this segment */
uint64_t rs_end; /* ending offset (non-inclusive) */
+ uint64_t rs_fill; /* actual fill if gap mode is on */
} range_seg_t;
struct range_tree_ops {
@@ -78,9 +82,14 @@
void range_tree_init(void);
void range_tree_fini(void);
-range_tree_t *range_tree_create(range_tree_ops_t *ops, void *arg);
+range_tree_t *range_tree_create_impl(range_tree_ops_t *ops, void *arg,
+ int (*avl_compare)(const void*, const void*), uint64_t gap);
+ range_tree_t *range_tree_create(range_tree_ops_t *ops, void *arg);
void range_tree_destroy(range_tree_t *rt);
boolean_t range_tree_contains(range_tree_t *rt, uint64_t start, uint64_t size);
+range_seg_t *range_tree_find(range_tree_t *rt, uint64_t start, uint64_t size);
+void range_tree_resize_segment(range_tree_t *rt, range_seg_t *rs,
+ uint64_t newstart, uint64_t newsize);
uint64_t range_tree_space(range_tree_t *rt);
boolean_t range_tree_is_empty(range_tree_t *rt);
void range_tree_verify(range_tree_t *rt, uint64_t start, uint64_t size);
@@ -89,10 +98,27 @@
void range_tree_add(void *arg, uint64_t start, uint64_t size);
void range_tree_remove(void *arg, uint64_t start, uint64_t size);
+void range_tree_remove_fill(range_tree_t *rt, uint64_t start, uint64_t size);
+void range_tree_adjust_fill(range_tree_t *rt, range_seg_t *rs, int64_t delta);
void range_tree_clear(range_tree_t *rt, uint64_t start, uint64_t size);
void range_tree_vacate(range_tree_t *rt, range_tree_func_t *func, void *arg);
void range_tree_walk(range_tree_t *rt, range_tree_func_t *func, void *arg);
+range_seg_t *range_tree_first(range_tree_t *rt);
+
+void rt_avl_create(range_tree_t *rt, void *arg);
+void rt_avl_destroy(range_tree_t *rt, void *arg);
+void rt_avl_add(range_tree_t *rt, range_seg_t *rs, void *arg);
+void rt_avl_remove(range_tree_t *rt, range_seg_t *rs, void *arg);
+void rt_avl_vacate(range_tree_t *rt, void *arg);
+extern struct range_tree_ops rt_avl_ops;
+
+void rt_avl_create(range_tree_t *rt, void *arg);
+void rt_avl_destroy(range_tree_t *rt, void *arg);
+void rt_avl_add(range_tree_t *rt, range_seg_t *rs, void *arg);
+void rt_avl_remove(range_tree_t *rt, range_seg_t *rs, void *arg);
+void rt_avl_vacate(range_tree_t *rt, void *arg);
+extern struct range_tree_ops rt_avl_ops;
#ifdef __cplusplus
}
Index: head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/spa_impl.h
===================================================================
--- head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/spa_impl.h
+++ head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/spa_impl.h
@@ -257,7 +257,8 @@
boolean_t spa_extreme_rewind; /* rewind past deferred frees */
uint64_t spa_last_io; /* lbolt of last non-scan I/O */
kmutex_t spa_scrub_lock; /* resilver/scrub lock */
- uint64_t spa_scrub_inflight; /* in-flight scrub I/Os */
+ uint64_t spa_scrub_inflight; /* in-flight scrub bytes */
+ uint64_t spa_load_verify_ios; /* in-flight verification IOs */
kcondvar_t spa_scrub_io_cv; /* scrub I/O completion */
uint8_t spa_scrub_active; /* active or suspended? */
uint8_t spa_scrub_type; /* type of scrub we're doing */
@@ -268,6 +269,7 @@
uint64_t spa_scan_pass_scrub_pause; /* scrub pause time */
uint64_t spa_scan_pass_scrub_spent_paused; /* total paused */
uint64_t spa_scan_pass_exam; /* examined bytes per pass */
+ uint64_t spa_scan_pass_issued; /* issued bytes per pass */
kmutex_t spa_async_lock; /* protect async state */
kthread_t *spa_async_thread; /* thread doing async task */
kthread_t *spa_async_thread_vd; /* thread doing vd async task */
Index: head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/vdev.h
===================================================================
--- head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/vdev.h
+++ head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/vdev.h
@@ -71,6 +71,7 @@
extern boolean_t vdev_dtl_contains(vdev_t *vd, vdev_dtl_type_t d,
uint64_t txg, uint64_t size);
extern boolean_t vdev_dtl_empty(vdev_t *vd, vdev_dtl_type_t d);
+extern boolean_t vdev_dtl_need_resilver(vdev_t *vd, uint64_t off, size_t size);
extern void vdev_dtl_reassess(vdev_t *vd, uint64_t txg, uint64_t scrub_txg,
int scrub_done);
extern boolean_t vdev_dtl_required(vdev_t *vd);
@@ -135,6 +136,7 @@
extern void vdev_queue_fini(vdev_t *vd);
extern zio_t *vdev_queue_io(zio_t *zio);
extern void vdev_queue_io_done(zio_t *zio);
+extern void vdev_queue_change_io_priority(zio_t *zio, zio_priority_t priority);
extern int vdev_queue_length(vdev_t *vd);
extern uint64_t vdev_queue_lastoffset(vdev_t *vd);
extern void vdev_queue_register_lastoffset(vdev_t *vd, zio_t *zio);
Index: head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/vdev_impl.h
===================================================================
--- head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/vdev_impl.h
+++ head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/vdev_impl.h
@@ -71,6 +71,7 @@
typedef void vdev_io_start_func_t(zio_t *zio);
typedef void vdev_io_done_func_t(zio_t *zio);
typedef void vdev_state_change_func_t(vdev_t *vd, int, int);
+typedef boolean_t vdev_need_resilver_func_t(vdev_t *vd, uint64_t, size_t);
typedef void vdev_hold_func_t(vdev_t *vd);
typedef void vdev_rele_func_t(vdev_t *vd);
@@ -86,6 +87,7 @@
vdev_io_start_func_t *vdev_op_io_start;
vdev_io_done_func_t *vdev_op_io_done;
vdev_state_change_func_t *vdev_op_state_change;
+ vdev_need_resilver_func_t *vdev_op_need_resilver;
vdev_hold_func_t *vdev_op_hold;
vdev_rele_func_t *vdev_op_rele;
vdev_remap_func_t *vdev_op_remap;
@@ -293,6 +295,13 @@
*/
uint64_t vdev_async_write_queue_depth;
uint64_t vdev_max_async_write_queue_depth;
+
+ /*
+ * Protects the vdev_scan_io_queue field itself as well as the
+ * structure's contents (when present).
+ */
+ kmutex_t vdev_scan_io_queue_lock;
+ struct dsl_scan_io_queue *vdev_scan_io_queue;
/*
* Leaf vdev state.
Index: head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zio.h
===================================================================
--- head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zio.h
+++ head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zio.h
@@ -593,6 +593,8 @@
extern void zio_vdev_io_reissue(zio_t *zio);
extern void zio_vdev_io_redone(zio_t *zio);
+extern void zio_change_priority(zio_t *pio, zio_priority_t priority);
+
extern void zio_checksum_verified(zio_t *zio);
extern int zio_worst_error(int e1, int e2);
Index: head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev.c
===================================================================
--- head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev.c
+++ head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev.c
@@ -559,6 +559,8 @@
mutex_init(&vd->vdev_stat_lock, NULL, MUTEX_DEFAULT, NULL);
mutex_init(&vd->vdev_probe_lock, NULL, MUTEX_DEFAULT, NULL);
mutex_init(&vd->vdev_queue_lock, NULL, MUTEX_DEFAULT, NULL);
+ mutex_init(&vd->vdev_scan_io_queue_lock, NULL, MUTEX_DEFAULT, NULL);
+
for (int t = 0; t < DTL_TYPES; t++) {
vd->vdev_dtl[t] = range_tree_create(NULL, NULL);
}
@@ -832,6 +834,18 @@
spa_t *spa = vd->vdev_spa;
/*
+ * Scan queues are normally destroyed at the end of a scan. If the
+ * queue exists here, that implies the vdev is being removed while
+ * the scan is still running.
+ */
+ if (vd->vdev_scan_io_queue != NULL) {
+ mutex_enter(&vd->vdev_scan_io_queue_lock);
+ dsl_scan_io_queue_destroy(vd->vdev_scan_io_queue);
+ vd->vdev_scan_io_queue = NULL;
+ mutex_exit(&vd->vdev_scan_io_queue_lock);
+ }
+
+ /*
* vdev_free() implies closing the vdev first. This is simpler than
* trying to ensure complicated semantics for all callers.
*/
@@ -920,6 +934,7 @@
mutex_destroy(&vd->vdev_dtl_lock);
mutex_destroy(&vd->vdev_stat_lock);
mutex_destroy(&vd->vdev_probe_lock);
+ mutex_destroy(&vd->vdev_scan_io_queue_lock);
if (vd == spa->spa_root_vdev)
spa->spa_root_vdev = NULL;
@@ -996,6 +1011,8 @@
tvd->vdev_islog = svd->vdev_islog;
svd->vdev_islog = 0;
+
+ dsl_scan_io_queue_vdev_xfer(svd, tvd);
}
static void
@@ -2286,6 +2303,21 @@
mutex_exit(&vd->vdev_dtl_lock);
return (empty);
+}
+
+/*
+ * Returns B_TRUE if vdev determines offset needs to be resilvered.
+ */
+boolean_t
+vdev_dtl_need_resilver(vdev_t *vd, uint64_t offset, size_t psize)
+{
+ ASSERT(vd != vd->vdev_spa->spa_root_vdev);
+
+ if (vd->vdev_ops->vdev_op_need_resilver == NULL ||
+ vd->vdev_ops->vdev_op_leaf)
+ return (B_TRUE);
+
+ return (vd->vdev_ops->vdev_op_need_resilver(vd, offset, psize));
}
/*
Index: head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev_disk.c
===================================================================
--- head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev_disk.c
+++ head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev_disk.c
@@ -837,6 +837,7 @@
vdev_disk_io_start,
vdev_disk_io_done,
NULL,
+ NULL,
vdev_disk_hold,
vdev_disk_rele,
NULL,
Index: head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev_file.c
===================================================================
--- head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev_file.c
+++ head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev_file.c
@@ -267,6 +267,7 @@
vdev_file_io_start,
vdev_file_io_done,
NULL,
+ NULL,
vdev_file_hold,
vdev_file_rele,
NULL,
@@ -285,6 +286,7 @@
vdev_default_asize,
vdev_file_io_start,
vdev_file_io_done,
+ NULL,
NULL,
vdev_file_hold,
vdev_file_rele,
Index: head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev_geom.c
===================================================================
--- head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev_geom.c
+++ head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev_geom.c
@@ -1147,6 +1147,7 @@
vdev_geom_io_start,
vdev_geom_io_done,
NULL,
+ NULL,
vdev_geom_hold,
vdev_geom_rele,
NULL,
Index: head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev_indirect.c
===================================================================
--- head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev_indirect.c
+++ head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev_indirect.c
@@ -1111,6 +1111,7 @@
NULL,
NULL,
NULL,
+ NULL,
vdev_indirect_remap,
VDEV_TYPE_INDIRECT, /* name of this vdev type */
B_FALSE /* leaf vdev */
Index: head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev_mirror.c
===================================================================
--- head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev_mirror.c
+++ head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev_mirror.c
@@ -722,6 +722,7 @@
NULL,
NULL,
NULL,
+ NULL,
VDEV_TYPE_MIRROR, /* name of this vdev type */
B_FALSE /* not a leaf vdev */
};
@@ -736,6 +737,7 @@
NULL,
NULL,
NULL,
+ NULL,
VDEV_TYPE_REPLACING, /* name of this vdev type */
B_FALSE /* not a leaf vdev */
};
@@ -747,6 +749,7 @@
vdev_mirror_io_start,
vdev_mirror_io_done,
vdev_mirror_state_change,
+ NULL,
NULL,
NULL,
NULL,
Index: head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev_missing.c
===================================================================
--- head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev_missing.c
+++ head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev_missing.c
@@ -90,6 +90,7 @@
NULL,
NULL,
NULL,
+ NULL,
VDEV_TYPE_MISSING, /* name of this vdev type */
B_TRUE /* leaf vdev */
};
@@ -100,6 +101,7 @@
vdev_default_asize,
vdev_missing_io_start,
vdev_missing_io_done,
+ NULL,
NULL,
NULL,
NULL,
Index: head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev_queue.c
===================================================================
--- head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev_queue.c
+++ head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev_queue.c
@@ -175,7 +175,7 @@
* we include spans of optional I/Os to aid aggregation at the disk even when
* they aren't able to help us aggregate at this level.
*/
-int zfs_vdev_aggregation_limit = SPA_OLD_MAXBLOCKSIZE;
+int zfs_vdev_aggregation_limit = 1 << 20;
int zfs_vdev_read_gap_limit = 32 << 10;
int zfs_vdev_write_gap_limit = 4 << 10;
@@ -933,6 +933,48 @@
zio_execute(nio);
}
mutex_enter(&vq->vq_lock);
+ }
+
+ mutex_exit(&vq->vq_lock);
+}
+
+void
+vdev_queue_change_io_priority(zio_t *zio, zio_priority_t priority)
+{
+ vdev_queue_t *vq = &zio->io_vd->vdev_queue;
+ avl_tree_t *tree;
+
+ ASSERT3U(zio->io_priority, <, ZIO_PRIORITY_NUM_QUEUEABLE);
+ ASSERT3U(priority, <, ZIO_PRIORITY_NUM_QUEUEABLE);
+
+ if (zio->io_type == ZIO_TYPE_READ) {
+ if (priority != ZIO_PRIORITY_SYNC_READ &&
+ priority != ZIO_PRIORITY_ASYNC_READ &&
+ priority != ZIO_PRIORITY_SCRUB)
+ priority = ZIO_PRIORITY_ASYNC_READ;
+ } else {
+ ASSERT(zio->io_type == ZIO_TYPE_WRITE);
+ if (priority != ZIO_PRIORITY_SYNC_WRITE &&
+ priority != ZIO_PRIORITY_ASYNC_WRITE)
+ priority = ZIO_PRIORITY_ASYNC_WRITE;
+ }
+
+ mutex_enter(&vq->vq_lock);
+
+ /*
+ * If the zio is in none of the queues we can simply change
+ * the priority. If the zio is waiting to be submitted we must
+ * remove it from the queue and re-insert it with the new priority.
+ * Otherwise, the zio is currently active and we cannot change its
+ * priority.
+ */
+ tree = vdev_queue_class_tree(vq, zio->io_priority);
+ if (avl_find(tree, zio, NULL) == zio) {
+ avl_remove(vdev_queue_class_tree(vq, zio->io_priority), zio);
+ zio->io_priority = priority;
+ avl_add(vdev_queue_class_tree(vq, zio->io_priority), zio);
+ } else if (avl_find(&vq->vq_active_tree, zio, NULL) != zio) {
+ zio->io_priority = priority;
}
mutex_exit(&vq->vq_lock);
Index: head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev_raidz.c
===================================================================
--- head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev_raidz.c
+++ head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev_raidz.c
@@ -2584,6 +2584,44 @@
vdev_set_state(vd, B_FALSE, VDEV_STATE_HEALTHY, VDEV_AUX_NONE);
}
+/*
+ * Determine if any portion of the provided block resides on a child vdev
+ * with a dirty DTL and therefore needs to be resilvered. The function
+ * assumes that at least one DTL is dirty which implies that full stripe
+ * width blocks must be resilvered.
+ */
+static boolean_t
+vdev_raidz_need_resilver(vdev_t *vd, uint64_t offset, size_t psize)
+{
+ uint64_t dcols = vd->vdev_children;
+ uint64_t nparity = vd->vdev_nparity;
+ uint64_t ashift = vd->vdev_top->vdev_ashift;
+ /* The starting RAIDZ (parent) vdev sector of the block. */
+ uint64_t b = offset >> ashift;
+ /* The zio's size in units of the vdev's minimum sector size. */
+ uint64_t s = ((psize - 1) >> ashift) + 1;
+ /* The first column for this stripe. */
+ uint64_t f = b % dcols;
+
+ if (s + nparity >= dcols)
+ return (B_TRUE);
+
+ for (uint64_t c = 0; c < s + nparity; c++) {
+ uint64_t devidx = (f + c) % dcols;
+ vdev_t *cvd = vd->vdev_child[devidx];
+
+ /*
+ * dsl_scan_need_resilver() already checked vd with
+ * vdev_dtl_contains(). So here just check cvd with
+ * vdev_dtl_empty(), cheaper and a good approximation.
+ */
+ if (!vdev_dtl_empty(cvd, DTL_PARTIAL))
+ return (B_TRUE);
+ }
+
+ return (B_FALSE);
+}
+
vdev_ops_t vdev_raidz_ops = {
vdev_raidz_open,
vdev_raidz_close,
@@ -2591,6 +2629,7 @@
vdev_raidz_io_start,
vdev_raidz_io_done,
vdev_raidz_state_change,
+ vdev_raidz_need_resilver,
NULL,
NULL,
NULL,
Index: head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev_root.c
===================================================================
--- head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev_root.c
+++ head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev_root.c
@@ -150,6 +150,7 @@
NULL,
NULL,
NULL,
+ NULL,
VDEV_TYPE_ROOT, /* name of this vdev type */
B_FALSE /* not a leaf vdev */
};
Index: head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zap.c
===================================================================
--- head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zap.c
+++ head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zap.c
@@ -1051,7 +1051,7 @@
}
err = zap_add(os, intoobj, za.za_name,
8, 1, &value, tx);
- if (err)
+ if (err != 0)
break;
}
zap_cursor_fini(&zc);
Index: head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zio.c
===================================================================
--- head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zio.c
+++ head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zio.c
@@ -41,6 +41,7 @@
#include <sys/trim_map.h>
#include <sys/blkptr.h>
#include <sys/zfeature.h>
+#include <sys/dsl_scan.h>
#include <sys/metaslab_impl.h>
#include <sys/abd.h>
@@ -438,6 +439,8 @@
{
list_t *cl = &pio->io_child_list;
+ ASSERT(MUTEX_HELD(&pio->io_lock));
+
*zl = (*zl == NULL) ? list_head(cl) : list_next(cl, *zl);
if (*zl == NULL)
return (NULL);
@@ -472,8 +475,8 @@
zl->zl_parent = pio;
zl->zl_child = cio;
- mutex_enter(&cio->io_lock);
mutex_enter(&pio->io_lock);
+ mutex_enter(&cio->io_lock);
ASSERT(pio->io_state[ZIO_WAIT_DONE] == 0);
@@ -486,8 +489,8 @@
pio->io_child_count++;
cio->io_parent_count++;
- mutex_exit(&pio->io_lock);
mutex_exit(&cio->io_lock);
+ mutex_exit(&pio->io_lock);
}
static void
@@ -496,8 +499,8 @@
ASSERT(zl->zl_parent == pio);
ASSERT(zl->zl_child == cio);
- mutex_enter(&cio->io_lock);
mutex_enter(&pio->io_lock);
+ mutex_enter(&cio->io_lock);
list_remove(&pio->io_child_list, zl);
list_remove(&cio->io_parent_list, zl);
@@ -505,9 +508,8 @@
pio->io_child_count--;
cio->io_parent_count--;
- mutex_exit(&pio->io_lock);
mutex_exit(&cio->io_lock);
-
+ mutex_exit(&pio->io_lock);
kmem_cache_free(zio_link_cache, zl);
}
@@ -988,6 +990,7 @@
metaslab_check_free(spa, bp);
arc_freed(spa, bp);
+ dsl_scan_freed(spa, bp);
if (zfs_trim_enabled)
stage |= ZIO_STAGE_ISSUE_ASYNC | ZIO_STAGE_VDEV_IO_START |
@@ -1865,14 +1868,16 @@
* cannot be affected by any side effects of reexecuting 'cio'.
*/
zio_link_t *zl = NULL;
+ mutex_enter(&pio->io_lock);
for (cio = zio_walk_children(pio, &zl); cio != NULL; cio = cio_next) {
cio_next = zio_walk_children(pio, &zl);
- mutex_enter(&pio->io_lock);
for (int w = 0; w < ZIO_WAIT_TYPES; w++)
pio->io_children[cio->io_child_type][w]++;
mutex_exit(&pio->io_lock);
zio_reexecute(cio);
+ mutex_enter(&pio->io_lock);
}
+ mutex_exit(&pio->io_lock);
/*
* Now that all children have been reexecuted, execute the parent.
@@ -3184,26 +3189,25 @@
}
}
- /*
- * We keep track of time-sensitive I/Os so that the scan thread
- * can quickly react to certain workloads. In particular, we care
- * about non-scrubbing, top-level reads and writes with the following
- * characteristics:
- * - synchronous writes of user data to non-slog devices
- * - any reads of user data
- * When these conditions are met, adjust the timestamp of spa_last_io
- * which allows the scan thread to adjust its workload accordingly.
- */
- if (!(zio->io_flags & ZIO_FLAG_SCAN_THREAD) && zio->io_bp != NULL &&
- vd == vd->vdev_top && !vd->vdev_islog &&
- zio->io_bookmark.zb_objset != DMU_META_OBJSET &&
- zio->io_txg != spa_syncing_txg(spa)) {
- uint64_t old = spa->spa_last_io;
- uint64_t new = ddi_get_lbolt64();
- if (old != new)
- (void) atomic_cas_64(&spa->spa_last_io, old, new);
- }
-
+ /*
+ * We keep track of time-sensitive I/Os so that the scan thread
+ * can quickly react to certain workloads. In particular, we care
+ * about non-scrubbing, top-level reads and writes with the following
+ * characteristics:
+ * - synchronous writes of user data to non-slog devices
+ * - any reads of user data
+ * When these conditions are met, adjust the timestamp of spa_last_io
+ * which allows the scan thread to adjust its workload accordingly.
+ */
+ if (!(zio->io_flags & ZIO_FLAG_SCAN_THREAD) && zio->io_bp != NULL &&
+ vd == vd->vdev_top && !vd->vdev_islog &&
+ zio->io_bookmark.zb_objset != DMU_META_OBJSET &&
+ zio->io_txg != spa_syncing_txg(spa)) {
+ uint64_t old = spa->spa_last_io;
+ uint64_t new = ddi_get_lbolt64();
+ if (old != new)
+ (void) atomic_cas_64(&spa->spa_last_io, old, new);
+ }
align = 1ULL << vd->vdev_top->vdev_ashift;
if (!(zio->io_flags & ZIO_FLAG_PHYSICAL) &&
@@ -3350,6 +3354,35 @@
VERIFY(vdev_probe(vd, zio) == NULL);
return (ZIO_PIPELINE_CONTINUE);
+}
+
+/*
+ * This function is used to change the priority of an existing zio that is
+ * currently in-flight. This is used by the arc to upgrade priority in the
+ * event that a demand read is made for a block that is currently queued
+ * as a scrub or async read IO. Otherwise, the high priority read request
+ * would end up having to wait for the lower priority IO.
+ */
+void
+zio_change_priority(zio_t *pio, zio_priority_t priority)
+{
+ zio_t *cio, *cio_next;
+ zio_link_t *zl = NULL;
+
+ ASSERT3U(priority, <, ZIO_PRIORITY_NUM_QUEUEABLE);
+
+ if (pio->io_vd != NULL && pio->io_vd->vdev_ops->vdev_op_leaf) {
+ vdev_queue_change_io_priority(pio, priority);
+ } else {
+ pio->io_priority = priority;
+ }
+
+ mutex_enter(&pio->io_lock);
+ for (cio = zio_walk_children(pio, &zl); cio != NULL; cio = cio_next) {
+ cio_next = zio_walk_children(pio, &zl);
+ zio_change_priority(cio, priority);
+ }
+ mutex_exit(&pio->io_lock);
}
/*
Index: head/sys/cddl/contrib/opensolaris/uts/common/sys/fs/zfs.h
===================================================================
--- head/sys/cddl/contrib/opensolaris/uts/common/sys/fs/zfs.h
+++ head/sys/cddl/contrib/opensolaris/uts/common/sys/fs/zfs.h
@@ -760,7 +760,7 @@
uint64_t pss_start_time; /* scan start time */
uint64_t pss_end_time; /* scan end time */
uint64_t pss_to_examine; /* total bytes to scan */
- uint64_t pss_examined; /* total examined bytes */
+ uint64_t pss_examined; /* total bytes located by scanner */
uint64_t pss_to_process; /* total bytes to process */
uint64_t pss_processed; /* total processed bytes */
uint64_t pss_errors; /* scan errors */
@@ -771,6 +771,12 @@
uint64_t pss_pass_scrub_pause; /* pause time of a scurb pass */
/* cumulative time scrub spent paused, needed for rate calculation */
uint64_t pss_pass_scrub_spent_paused;
+
+ /* Sorted scrubbing new fields */
+ /* Stored on disk */
+ uint64_t pss_issued; /* total bytes checked by scanner */
+ /* Not stored on disk */
+ uint64_t pss_pass_issued; /* issued bytes per scan pass */
} pool_scan_stat_t;
typedef struct pool_removal_stat {
Index: head/sys/cddl/contrib/opensolaris/uts/common/sys/taskq.h
===================================================================
--- head/sys/cddl/contrib/opensolaris/uts/common/sys/taskq.h
+++ head/sys/cddl/contrib/opensolaris/uts/common/sys/taskq.h
@@ -72,6 +72,8 @@
#define TQ_NOALLOC 0x04 /* cannot allocate memory; may fail */
#define TQ_FRONT 0x08 /* Put task at the front of the queue */
+#define TASKQID_INVALID ((taskqid_t)0)
+
#ifdef _KERNEL
extern taskq_t *system_taskq;
@@ -91,6 +93,7 @@
void nulltask(void *);
void taskq_destroy(taskq_t *);
void taskq_wait(taskq_t *);
+void taskq_wait_id(taskq_t *, taskqid_t);
void taskq_suspend(taskq_t *);
int taskq_suspended(taskq_t *);
void taskq_resume(taskq_t *);
File Metadata
Details
Attached
Mime Type
text/plain
Expires
Fri, Oct 17, 12:43 AM (5 h, 18 m)
Storage Engine
blob
Storage Format
Raw Data
Storage Handle
23810812
Default Alt Text
D15562.id.diff (193 KB)
Attached To
Mode
D15562: ZFS sorted scans
Attached
Detach File
Event Timeline
Log In to Comment