Changeset View
Changeset View
Standalone View
Standalone View
sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zio.c
Show All 35 Lines | |||||
#include <sys/zio_compress.h> | #include <sys/zio_compress.h> | ||||
#include <sys/zio_checksum.h> | #include <sys/zio_checksum.h> | ||||
#include <sys/dmu_objset.h> | #include <sys/dmu_objset.h> | ||||
#include <sys/arc.h> | #include <sys/arc.h> | ||||
#include <sys/ddt.h> | #include <sys/ddt.h> | ||||
#include <sys/trim_map.h> | #include <sys/trim_map.h> | ||||
#include <sys/blkptr.h> | #include <sys/blkptr.h> | ||||
#include <sys/zfeature.h> | #include <sys/zfeature.h> | ||||
#include <sys/dsl_scan.h> | |||||
#include <sys/metaslab_impl.h> | #include <sys/metaslab_impl.h> | ||||
#include <sys/abd.h> | #include <sys/abd.h> | ||||
SYSCTL_DECL(_vfs_zfs); | SYSCTL_DECL(_vfs_zfs); | ||||
SYSCTL_NODE(_vfs_zfs, OID_AUTO, zio, CTLFLAG_RW, 0, "ZFS ZIO"); | SYSCTL_NODE(_vfs_zfs, OID_AUTO, zio, CTLFLAG_RW, 0, "ZFS ZIO"); | ||||
#if defined(__amd64__) | #if defined(__amd64__) | ||||
static int zio_use_uma = 1; | static int zio_use_uma = 1; | ||||
#else | #else | ||||
▲ Show 20 Lines • Show All 381 Lines • ▼ Show 20 Lines | zio_walk_parents(zio_t *cio, zio_link_t **zl) | ||||
return ((*zl)->zl_parent); | return ((*zl)->zl_parent); | ||||
} | } | ||||
zio_t * | zio_t * | ||||
zio_walk_children(zio_t *pio, zio_link_t **zl) | zio_walk_children(zio_t *pio, zio_link_t **zl) | ||||
{ | { | ||||
list_t *cl = &pio->io_child_list; | list_t *cl = &pio->io_child_list; | ||||
ASSERT(MUTEX_HELD(&pio->io_lock)); | |||||
*zl = (*zl == NULL) ? list_head(cl) : list_next(cl, *zl); | *zl = (*zl == NULL) ? list_head(cl) : list_next(cl, *zl); | ||||
if (*zl == NULL) | if (*zl == NULL) | ||||
return (NULL); | return (NULL); | ||||
ASSERT((*zl)->zl_parent == pio); | ASSERT((*zl)->zl_parent == pio); | ||||
return ((*zl)->zl_child); | return ((*zl)->zl_child); | ||||
} | } | ||||
Show All 18 Lines | zio_add_child(zio_t *pio, zio_t *cio) | ||||
* Vdev I/Os can only have vdev children. | * Vdev I/Os can only have vdev children. | ||||
* The following ASSERT captures all of these constraints. | * The following ASSERT captures all of these constraints. | ||||
*/ | */ | ||||
ASSERT3S(cio->io_child_type, <=, pio->io_child_type); | ASSERT3S(cio->io_child_type, <=, pio->io_child_type); | ||||
zl->zl_parent = pio; | zl->zl_parent = pio; | ||||
zl->zl_child = cio; | zl->zl_child = cio; | ||||
mutex_enter(&cio->io_lock); | |||||
mutex_enter(&pio->io_lock); | mutex_enter(&pio->io_lock); | ||||
mutex_enter(&cio->io_lock); | |||||
ASSERT(pio->io_state[ZIO_WAIT_DONE] == 0); | ASSERT(pio->io_state[ZIO_WAIT_DONE] == 0); | ||||
for (int w = 0; w < ZIO_WAIT_TYPES; w++) | for (int w = 0; w < ZIO_WAIT_TYPES; w++) | ||||
pio->io_children[cio->io_child_type][w] += !cio->io_state[w]; | pio->io_children[cio->io_child_type][w] += !cio->io_state[w]; | ||||
list_insert_head(&pio->io_child_list, zl); | list_insert_head(&pio->io_child_list, zl); | ||||
list_insert_head(&cio->io_parent_list, zl); | list_insert_head(&cio->io_parent_list, zl); | ||||
pio->io_child_count++; | pio->io_child_count++; | ||||
cio->io_parent_count++; | cio->io_parent_count++; | ||||
mutex_exit(&pio->io_lock); | |||||
mutex_exit(&cio->io_lock); | mutex_exit(&cio->io_lock); | ||||
mutex_exit(&pio->io_lock); | |||||
} | } | ||||
static void | static void | ||||
zio_remove_child(zio_t *pio, zio_t *cio, zio_link_t *zl) | zio_remove_child(zio_t *pio, zio_t *cio, zio_link_t *zl) | ||||
{ | { | ||||
ASSERT(zl->zl_parent == pio); | ASSERT(zl->zl_parent == pio); | ||||
ASSERT(zl->zl_child == cio); | ASSERT(zl->zl_child == cio); | ||||
mutex_enter(&cio->io_lock); | |||||
mutex_enter(&pio->io_lock); | mutex_enter(&pio->io_lock); | ||||
mutex_enter(&cio->io_lock); | |||||
list_remove(&pio->io_child_list, zl); | list_remove(&pio->io_child_list, zl); | ||||
list_remove(&cio->io_parent_list, zl); | list_remove(&cio->io_parent_list, zl); | ||||
pio->io_child_count--; | pio->io_child_count--; | ||||
cio->io_parent_count--; | cio->io_parent_count--; | ||||
mutex_exit(&pio->io_lock); | |||||
mutex_exit(&cio->io_lock); | mutex_exit(&cio->io_lock); | ||||
mutex_exit(&pio->io_lock); | |||||
kmem_cache_free(zio_link_cache, zl); | kmem_cache_free(zio_link_cache, zl); | ||||
} | } | ||||
static boolean_t | static boolean_t | ||||
zio_wait_for_children(zio_t *zio, uint8_t childbits, enum zio_wait_type wait) | zio_wait_for_children(zio_t *zio, uint8_t childbits, enum zio_wait_type wait) | ||||
{ | { | ||||
boolean_t waiting = B_FALSE; | boolean_t waiting = B_FALSE; | ||||
▲ Show 20 Lines • Show All 464 Lines • ▼ Show 20 Lines | zio_free_sync(zio_t *pio, spa_t *spa, uint64_t txg, const blkptr_t *bp, | ||||
ASSERT(spa_syncing_txg(spa) == txg); | ASSERT(spa_syncing_txg(spa) == txg); | ||||
ASSERT(spa_sync_pass(spa) < zfs_sync_pass_deferred_free); | ASSERT(spa_sync_pass(spa) < zfs_sync_pass_deferred_free); | ||||
if (BP_IS_EMBEDDED(bp)) | if (BP_IS_EMBEDDED(bp)) | ||||
return (zio_null(pio, spa, NULL, NULL, NULL, 0)); | return (zio_null(pio, spa, NULL, NULL, NULL, 0)); | ||||
metaslab_check_free(spa, bp); | metaslab_check_free(spa, bp); | ||||
arc_freed(spa, bp); | arc_freed(spa, bp); | ||||
dsl_scan_freed(spa, bp); | |||||
if (zfs_trim_enabled) | if (zfs_trim_enabled) | ||||
stage |= ZIO_STAGE_ISSUE_ASYNC | ZIO_STAGE_VDEV_IO_START | | stage |= ZIO_STAGE_ISSUE_ASYNC | ZIO_STAGE_VDEV_IO_START | | ||||
ZIO_STAGE_VDEV_IO_ASSESS; | ZIO_STAGE_VDEV_IO_ASSESS; | ||||
/* | /* | ||||
* GANG and DEDUP blocks can induce a read (for the gang block header, | * GANG and DEDUP blocks can induce a read (for the gang block header, | ||||
* or the DDT), so issue them asynchronously so that this thread is | * or the DDT), so issue them asynchronously so that this thread is | ||||
* not tied up. | * not tied up. | ||||
▲ Show 20 Lines • Show All 861 Lines • ▼ Show 20 Lines | zio_reexecute(zio_t *pio) | ||||
/* | /* | ||||
* As we reexecute pio's children, new children could be created. | * As we reexecute pio's children, new children could be created. | ||||
* New children go to the head of pio's io_child_list, however, | * New children go to the head of pio's io_child_list, however, | ||||
* so we will (correctly) not reexecute them. The key is that | * so we will (correctly) not reexecute them. The key is that | ||||
* the remainder of pio's io_child_list, from 'cio_next' onward, | * the remainder of pio's io_child_list, from 'cio_next' onward, | ||||
* cannot be affected by any side effects of reexecuting 'cio'. | * cannot be affected by any side effects of reexecuting 'cio'. | ||||
*/ | */ | ||||
zio_link_t *zl = NULL; | zio_link_t *zl = NULL; | ||||
mutex_enter(&pio->io_lock); | |||||
for (cio = zio_walk_children(pio, &zl); cio != NULL; cio = cio_next) { | for (cio = zio_walk_children(pio, &zl); cio != NULL; cio = cio_next) { | ||||
cio_next = zio_walk_children(pio, &zl); | cio_next = zio_walk_children(pio, &zl); | ||||
mutex_enter(&pio->io_lock); | |||||
for (int w = 0; w < ZIO_WAIT_TYPES; w++) | for (int w = 0; w < ZIO_WAIT_TYPES; w++) | ||||
pio->io_children[cio->io_child_type][w]++; | pio->io_children[cio->io_child_type][w]++; | ||||
mutex_exit(&pio->io_lock); | mutex_exit(&pio->io_lock); | ||||
zio_reexecute(cio); | zio_reexecute(cio); | ||||
mutex_enter(&pio->io_lock); | |||||
} | } | ||||
mutex_exit(&pio->io_lock); | |||||
/* | /* | ||||
* Now that all children have been reexecuted, execute the parent. | * Now that all children have been reexecuted, execute the parent. | ||||
* We don't reexecute "The Godfather" I/O here as it's the | * We don't reexecute "The Godfather" I/O here as it's the | ||||
* responsibility of the caller to wait on it. | * responsibility of the caller to wait on it. | ||||
*/ | */ | ||||
if (!(pio->io_flags & ZIO_FLAG_GODFATHER)) { | if (!(pio->io_flags & ZIO_FLAG_GODFATHER)) { | ||||
pio->io_queued_timestamp = gethrtime(); | pio->io_queued_timestamp = gethrtime(); | ||||
▲ Show 20 Lines • Show All 1,295 Lines • ▼ Show 20 Lines | if (zio->io_type == ZIO_TYPE_WRITE) { | ||||
if (zio->io_vd->vdev_removing) { | if (zio->io_vd->vdev_removing) { | ||||
ASSERT(zio->io_flags & | ASSERT(zio->io_flags & | ||||
(ZIO_FLAG_PHYSICAL | ZIO_FLAG_SELF_HEAL | | (ZIO_FLAG_PHYSICAL | ZIO_FLAG_SELF_HEAL | | ||||
ZIO_FLAG_INDUCE_DAMAGE)); | ZIO_FLAG_INDUCE_DAMAGE)); | ||||
} | } | ||||
} | } | ||||
/* | /* | ||||
* We keep track of time-sensitive I/Os so that the scan thread | * We keep track of time-sensitive I/Os so that the scan thread | ||||
* can quickly react to certain workloads. In particular, we care | * can quickly react to certain workloads. In particular, we care | ||||
* about non-scrubbing, top-level reads and writes with the following | * about non-scrubbing, top-level reads and writes with the following | ||||
* characteristics: | * characteristics: | ||||
* - synchronous writes of user data to non-slog devices | * - synchronous writes of user data to non-slog devices | ||||
* - any reads of user data | * - any reads of user data | ||||
* When these conditions are met, adjust the timestamp of spa_last_io | * When these conditions are met, adjust the timestamp of spa_last_io | ||||
* which allows the scan thread to adjust its workload accordingly. | * which allows the scan thread to adjust its workload accordingly. | ||||
*/ | */ | ||||
if (!(zio->io_flags & ZIO_FLAG_SCAN_THREAD) && zio->io_bp != NULL && | if (!(zio->io_flags & ZIO_FLAG_SCAN_THREAD) && zio->io_bp != NULL && | ||||
vd == vd->vdev_top && !vd->vdev_islog && | vd == vd->vdev_top && !vd->vdev_islog && | ||||
zio->io_bookmark.zb_objset != DMU_META_OBJSET && | zio->io_bookmark.zb_objset != DMU_META_OBJSET && | ||||
zio->io_txg != spa_syncing_txg(spa)) { | zio->io_txg != spa_syncing_txg(spa)) { | ||||
uint64_t old = spa->spa_last_io; | uint64_t old = spa->spa_last_io; | ||||
uint64_t new = ddi_get_lbolt64(); | uint64_t new = ddi_get_lbolt64(); | ||||
if (old != new) | if (old != new) | ||||
(void) atomic_cas_64(&spa->spa_last_io, old, new); | (void) atomic_cas_64(&spa->spa_last_io, old, new); | ||||
} | } | ||||
align = 1ULL << vd->vdev_top->vdev_ashift; | align = 1ULL << vd->vdev_top->vdev_ashift; | ||||
if (!(zio->io_flags & ZIO_FLAG_PHYSICAL) && | if (!(zio->io_flags & ZIO_FLAG_PHYSICAL) && | ||||
P2PHASE(zio->io_size, align) != 0) { | P2PHASE(zio->io_size, align) != 0) { | ||||
/* Transform logical writes to be a full physical block size. */ | /* Transform logical writes to be a full physical block size. */ | ||||
uint64_t asize = P2ROUNDUP(zio->io_size, align); | uint64_t asize = P2ROUNDUP(zio->io_size, align); | ||||
abd_t *abuf = NULL; | abd_t *abuf = NULL; | ||||
if (zio->io_type == ZIO_TYPE_READ || | if (zio->io_type == ZIO_TYPE_READ || | ||||
▲ Show 20 Lines • Show All 130 Lines • ▼ Show 20 Lines | zio_vdev_io_done(zio_t *zio) | ||||
} | } | ||||
ops->vdev_op_io_done(zio); | ops->vdev_op_io_done(zio); | ||||
if (unexpected_error) | if (unexpected_error) | ||||
VERIFY(vdev_probe(vd, zio) == NULL); | VERIFY(vdev_probe(vd, zio) == NULL); | ||||
return (ZIO_PIPELINE_CONTINUE); | return (ZIO_PIPELINE_CONTINUE); | ||||
} | |||||
/* | |||||
* This function is used to change the priority of an existing zio that is | |||||
* currently in-flight. This is used by the arc to upgrade priority in the | |||||
* event that a demand read is made for a block that is currently queued | |||||
* as a scrub or async read IO. Otherwise, the high priority read request | |||||
* would end up having to wait for the lower priority IO. | |||||
*/ | |||||
void | |||||
zio_change_priority(zio_t *pio, zio_priority_t priority) | |||||
{ | |||||
zio_t *cio, *cio_next; | |||||
zio_link_t *zl = NULL; | |||||
ASSERT3U(priority, <, ZIO_PRIORITY_NUM_QUEUEABLE); | |||||
if (pio->io_vd != NULL && pio->io_vd->vdev_ops->vdev_op_leaf) { | |||||
vdev_queue_change_io_priority(pio, priority); | |||||
} else { | |||||
pio->io_priority = priority; | |||||
} | |||||
mutex_enter(&pio->io_lock); | |||||
for (cio = zio_walk_children(pio, &zl); cio != NULL; cio = cio_next) { | |||||
cio_next = zio_walk_children(pio, &zl); | |||||
zio_change_priority(cio, priority); | |||||
} | |||||
mutex_exit(&pio->io_lock); | |||||
} | } | ||||
/* | /* | ||||
* For non-raidz ZIOs, we can just copy aside the bad data read from the | * For non-raidz ZIOs, we can just copy aside the bad data read from the | ||||
* disk, and use that to finish the checksum ereport later. | * disk, and use that to finish the checksum ereport later. | ||||
*/ | */ | ||||
static void | static void | ||||
zio_vsd_default_cksum_finish(zio_cksum_report_t *zcr, | zio_vsd_default_cksum_finish(zio_cksum_report_t *zcr, | ||||
▲ Show 20 Lines • Show All 853 Lines • Show Last 20 Lines |