Changeset View
Changeset View
Standalone View
Standalone View
sys/contrib/openzfs/module/zfs/zio.c
Show First 20 Lines • Show All 2,475 Lines • ▼ Show 20 Lines | |||||
* | * | ||||
* In all cases, the gang tree allows complete recovery from partial failure. | * In all cases, the gang tree allows complete recovery from partial failure. | ||||
* ========================================================================== | * ========================================================================== | ||||
*/ | */ | ||||
static void | static void | ||||
zio_gang_issue_func_done(zio_t *zio) | zio_gang_issue_func_done(zio_t *zio) | ||||
{ | { | ||||
abd_put(zio->io_abd); | abd_free(zio->io_abd); | ||||
} | } | ||||
static zio_t * | static zio_t * | ||||
zio_read_gang(zio_t *pio, blkptr_t *bp, zio_gang_node_t *gn, abd_t *data, | zio_read_gang(zio_t *pio, blkptr_t *bp, zio_gang_node_t *gn, abd_t *data, | ||||
uint64_t offset) | uint64_t offset) | ||||
{ | { | ||||
if (gn != NULL) | if (gn != NULL) | ||||
return (pio); | return (pio); | ||||
Show All 27 Lines | if (gn != NULL) { | ||||
* this is just good hygiene.) | * this is just good hygiene.) | ||||
*/ | */ | ||||
if (gn != pio->io_gang_leader->io_gang_tree) { | if (gn != pio->io_gang_leader->io_gang_tree) { | ||||
abd_t *buf = abd_get_offset(data, offset); | abd_t *buf = abd_get_offset(data, offset); | ||||
zio_checksum_compute(zio, BP_GET_CHECKSUM(bp), | zio_checksum_compute(zio, BP_GET_CHECKSUM(bp), | ||||
buf, BP_GET_PSIZE(bp)); | buf, BP_GET_PSIZE(bp)); | ||||
abd_put(buf); | abd_free(buf); | ||||
} | } | ||||
/* | /* | ||||
* If we are here to damage data for testing purposes, | * If we are here to damage data for testing purposes, | ||||
* leave the GBH alone so that we can detect the damage. | * leave the GBH alone so that we can detect the damage. | ||||
*/ | */ | ||||
if (pio->io_gang_leader->io_flags & ZIO_FLAG_INDUCE_DAMAGE) | if (pio->io_gang_leader->io_flags & ZIO_FLAG_INDUCE_DAMAGE) | ||||
zio->io_pipeline &= ~ZIO_VDEV_IO_STAGES; | zio->io_pipeline &= ~ZIO_VDEV_IO_STAGES; | ||||
} else { | } else { | ||||
▲ Show 20 Lines • Show All 111 Lines • ▼ Show 20 Lines | zio_gang_tree_assemble_done(zio_t *zio) | ||||
/* this ABD was created from a linear buf in zio_gang_tree_assemble */ | /* this ABD was created from a linear buf in zio_gang_tree_assemble */ | ||||
if (BP_SHOULD_BYTESWAP(bp)) | if (BP_SHOULD_BYTESWAP(bp)) | ||||
byteswap_uint64_array(abd_to_buf(zio->io_abd), zio->io_size); | byteswap_uint64_array(abd_to_buf(zio->io_abd), zio->io_size); | ||||
ASSERT3P(abd_to_buf(zio->io_abd), ==, gn->gn_gbh); | ASSERT3P(abd_to_buf(zio->io_abd), ==, gn->gn_gbh); | ||||
ASSERT(zio->io_size == SPA_GANGBLOCKSIZE); | ASSERT(zio->io_size == SPA_GANGBLOCKSIZE); | ||||
ASSERT(gn->gn_gbh->zg_tail.zec_magic == ZEC_MAGIC); | ASSERT(gn->gn_gbh->zg_tail.zec_magic == ZEC_MAGIC); | ||||
abd_put(zio->io_abd); | abd_free(zio->io_abd); | ||||
for (int g = 0; g < SPA_GBH_NBLKPTRS; g++) { | for (int g = 0; g < SPA_GBH_NBLKPTRS; g++) { | ||||
blkptr_t *gbp = &gn->gn_gbh->zg_blkptr[g]; | blkptr_t *gbp = &gn->gn_gbh->zg_blkptr[g]; | ||||
if (!BP_IS_GANG(gbp)) | if (!BP_IS_GANG(gbp)) | ||||
continue; | continue; | ||||
zio_gang_tree_assemble(gio, gbp, &gn->gn_child[g]); | zio_gang_tree_assemble(gio, gbp, &gn->gn_child[g]); | ||||
} | } | ||||
} | } | ||||
▲ Show 20 Lines • Show All 107 Lines • ▼ Show 20 Lines | |||||
zio_write_gang_done(zio_t *zio) | zio_write_gang_done(zio_t *zio) | ||||
{ | { | ||||
/* | /* | ||||
* The io_abd field will be NULL for a zio with no data. The io_flags | * The io_abd field will be NULL for a zio with no data. The io_flags | ||||
* will initially have the ZIO_FLAG_NODATA bit flag set, but we can't | * will initially have the ZIO_FLAG_NODATA bit flag set, but we can't | ||||
* check for it here as it is cleared in zio_ready. | * check for it here as it is cleared in zio_ready. | ||||
*/ | */ | ||||
if (zio->io_abd != NULL) | if (zio->io_abd != NULL) | ||||
abd_put(zio->io_abd); | abd_free(zio->io_abd); | ||||
} | } | ||||
static zio_t * | static zio_t * | ||||
zio_write_gang_block(zio_t *pio) | zio_write_gang_block(zio_t *pio, metaslab_class_t *mc) | ||||
{ | { | ||||
spa_t *spa = pio->io_spa; | spa_t *spa = pio->io_spa; | ||||
metaslab_class_t *mc = spa_normal_class(spa); | |||||
blkptr_t *bp = pio->io_bp; | blkptr_t *bp = pio->io_bp; | ||||
zio_t *gio = pio->io_gang_leader; | zio_t *gio = pio->io_gang_leader; | ||||
zio_t *zio; | zio_t *zio; | ||||
zio_gang_node_t *gn, **gnpp; | zio_gang_node_t *gn, **gnpp; | ||||
zio_gbh_phys_t *gbh; | zio_gbh_phys_t *gbh; | ||||
abd_t *gbh_abd; | abd_t *gbh_abd; | ||||
uint64_t txg = pio->io_txg; | uint64_t txg = pio->io_txg; | ||||
uint64_t resid = pio->io_size; | uint64_t resid = pio->io_size; | ||||
▲ Show 20 Lines • Show All 700 Lines • ▼ Show 20 Lines | zio_dva_allocate(zio_t *zio) | ||||
mc = zio->io_metaslab_class; | mc = zio->io_metaslab_class; | ||||
if (mc == NULL) { | if (mc == NULL) { | ||||
mc = spa_preferred_class(spa, zio->io_size, | mc = spa_preferred_class(spa, zio->io_size, | ||||
zio->io_prop.zp_type, zio->io_prop.zp_level, | zio->io_prop.zp_type, zio->io_prop.zp_level, | ||||
zio->io_prop.zp_zpl_smallblk); | zio->io_prop.zp_zpl_smallblk); | ||||
zio->io_metaslab_class = mc; | zio->io_metaslab_class = mc; | ||||
} | } | ||||
/* | |||||
* Try allocating the block in the usual metaslab class. | |||||
* If that's full, allocate it in the normal class. | |||||
* If that's full, allocate as a gang block, | |||||
* and if all are full, the allocation fails (which shouldn't happen). | |||||
* | |||||
* Note that we do not fall back on embedded slog (ZIL) space, to | |||||
* preserve unfragmented slog space, which is critical for decent | |||||
* sync write performance. If a log allocation fails, we will fall | |||||
* back to spa_sync() which is abysmal for performance. | |||||
*/ | |||||
error = metaslab_alloc(spa, mc, zio->io_size, bp, | error = metaslab_alloc(spa, mc, zio->io_size, bp, | ||||
zio->io_prop.zp_copies, zio->io_txg, NULL, flags, | zio->io_prop.zp_copies, zio->io_txg, NULL, flags, | ||||
&zio->io_alloc_list, zio, zio->io_allocator); | &zio->io_alloc_list, zio, zio->io_allocator); | ||||
/* | /* | ||||
* Fallback to normal class when an alloc class is full | * Fallback to normal class when an alloc class is full | ||||
*/ | */ | ||||
if (error == ENOSPC && mc != spa_normal_class(spa)) { | if (error == ENOSPC && mc != spa_normal_class(spa)) { | ||||
/* | /* | ||||
* If throttling, transfer reservation over to normal class. | * If throttling, transfer reservation over to normal class. | ||||
* The io_allocator slot can remain the same even though we | * The io_allocator slot can remain the same even though we | ||||
* are switching classes. | * are switching classes. | ||||
*/ | */ | ||||
if (mc->mc_alloc_throttle_enabled && | if (mc->mc_alloc_throttle_enabled && | ||||
(zio->io_flags & ZIO_FLAG_IO_ALLOCATING)) { | (zio->io_flags & ZIO_FLAG_IO_ALLOCATING)) { | ||||
metaslab_class_throttle_unreserve(mc, | metaslab_class_throttle_unreserve(mc, | ||||
zio->io_prop.zp_copies, zio->io_allocator, zio); | zio->io_prop.zp_copies, zio->io_allocator, zio); | ||||
zio->io_flags &= ~ZIO_FLAG_IO_ALLOCATING; | zio->io_flags &= ~ZIO_FLAG_IO_ALLOCATING; | ||||
mc = spa_normal_class(spa); | VERIFY(metaslab_class_throttle_reserve( | ||||
VERIFY(metaslab_class_throttle_reserve(mc, | spa_normal_class(spa), | ||||
zio->io_prop.zp_copies, zio->io_allocator, zio, | zio->io_prop.zp_copies, zio->io_allocator, zio, | ||||
flags | METASLAB_MUST_RESERVE)); | flags | METASLAB_MUST_RESERVE)); | ||||
} else { | |||||
mc = spa_normal_class(spa); | |||||
} | } | ||||
zio->io_metaslab_class = mc; | zio->io_metaslab_class = mc = spa_normal_class(spa); | ||||
if (zfs_flags & ZFS_DEBUG_METASLAB_ALLOC) { | |||||
zfs_dbgmsg("%s: metaslab allocation failure, " | |||||
"trying normal class: zio %px, size %llu, error %d", | |||||
spa_name(spa), zio, zio->io_size, error); | |||||
} | |||||
error = metaslab_alloc(spa, mc, zio->io_size, bp, | error = metaslab_alloc(spa, mc, zio->io_size, bp, | ||||
zio->io_prop.zp_copies, zio->io_txg, NULL, flags, | zio->io_prop.zp_copies, zio->io_txg, NULL, flags, | ||||
&zio->io_alloc_list, zio, zio->io_allocator); | &zio->io_alloc_list, zio, zio->io_allocator); | ||||
} | } | ||||
if (error == ENOSPC && zio->io_size > SPA_MINBLOCKSIZE) { | |||||
if (zfs_flags & ZFS_DEBUG_METASLAB_ALLOC) { | |||||
zfs_dbgmsg("%s: metaslab allocation failure, " | |||||
"trying ganging: zio %px, size %llu, error %d", | |||||
spa_name(spa), zio, zio->io_size, error); | |||||
} | |||||
return (zio_write_gang_block(zio, mc)); | |||||
} | |||||
if (error != 0) { | if (error != 0) { | ||||
if (error != ENOSPC || | |||||
(zfs_flags & ZFS_DEBUG_METASLAB_ALLOC)) { | |||||
zfs_dbgmsg("%s: metaslab allocation failure: zio %px, " | zfs_dbgmsg("%s: metaslab allocation failure: zio %px, " | ||||
"size %llu, error %d", spa_name(spa), zio, zio->io_size, | "size %llu, error %d", | ||||
error); | spa_name(spa), zio, zio->io_size, error); | ||||
if (error == ENOSPC && zio->io_size > SPA_MINBLOCKSIZE) | } | ||||
return (zio_write_gang_block(zio)); | |||||
zio->io_error = error; | zio->io_error = error; | ||||
} | } | ||||
return (zio); | return (zio); | ||||
} | } | ||||
static zio_t * | static zio_t * | ||||
zio_dva_free(zio_t *zio) | zio_dva_free(zio_t *zio) | ||||
▲ Show 20 Lines • Show All 63 Lines • ▼ Show 20 Lines | zio_alloc_zil(spa_t *spa, objset_t *os, uint64_t txg, blkptr_t *new_bp, | ||||
* When allocating a zil block, we don't have information about | * When allocating a zil block, we don't have information about | ||||
* the final destination of the block except the objset it's part | * the final destination of the block except the objset it's part | ||||
* of, so we just hash the objset ID to pick the allocator to get | * of, so we just hash the objset ID to pick the allocator to get | ||||
* some parallelism. | * some parallelism. | ||||
*/ | */ | ||||
int flags = METASLAB_FASTWRITE | METASLAB_ZIL; | int flags = METASLAB_FASTWRITE | METASLAB_ZIL; | ||||
int allocator = cityhash4(0, 0, 0, os->os_dsl_dataset->ds_object) % | int allocator = cityhash4(0, 0, 0, os->os_dsl_dataset->ds_object) % | ||||
spa->spa_alloc_count; | spa->spa_alloc_count; | ||||
error = metaslab_alloc(spa, spa_log_class(spa), size, new_bp, | error = metaslab_alloc(spa, spa_log_class(spa), size, new_bp, 1, | ||||
1, txg, NULL, flags, &io_alloc_list, NULL, allocator); | txg, NULL, flags, &io_alloc_list, NULL, allocator); | ||||
if (error == 0) { | *slog = (error == 0); | ||||
*slog = TRUE; | if (error != 0) { | ||||
} else { | error = metaslab_alloc(spa, spa_embedded_log_class(spa), size, | ||||
error = metaslab_alloc(spa, spa_normal_class(spa), size, new_bp, | new_bp, 1, txg, NULL, flags, | ||||
1, txg, NULL, flags, &io_alloc_list, NULL, allocator); | &io_alloc_list, NULL, allocator); | ||||
if (error == 0) | } | ||||
*slog = FALSE; | if (error != 0) { | ||||
error = metaslab_alloc(spa, spa_normal_class(spa), size, | |||||
new_bp, 1, txg, NULL, flags, | |||||
&io_alloc_list, NULL, allocator); | |||||
} | } | ||||
metaslab_trace_fini(&io_alloc_list); | metaslab_trace_fini(&io_alloc_list); | ||||
if (error == 0) { | if (error == 0) { | ||||
BP_SET_LSIZE(new_bp, size); | BP_SET_LSIZE(new_bp, size); | ||||
BP_SET_PSIZE(new_bp, size); | BP_SET_PSIZE(new_bp, size); | ||||
BP_SET_COMPRESS(new_bp, ZIO_COMPRESS_OFF); | BP_SET_COMPRESS(new_bp, ZIO_COMPRESS_OFF); | ||||
BP_SET_CHECKSUM(new_bp, | BP_SET_CHECKSUM(new_bp, | ||||
▲ Show 20 Lines • Show All 1,376 Lines • Show Last 20 Lines |