Index: sys/cddl/contrib/opensolaris/uts/common/fs/zfs/abd.c =================================================================== --- sys/cddl/contrib/opensolaris/uts/common/fs/zfs/abd.c +++ sys/cddl/contrib/opensolaris/uts/common/fs/zfs/abd.c @@ -758,6 +758,9 @@ int ret = 0; struct abd_iter aiter; + if (size == 0) + return (ret); + abd_verify(abd); ASSERT3U(off + size, <=, abd->abd_size); @@ -886,6 +889,9 @@ int ret = 0; struct abd_iter daiter, saiter; + if (size == 0) + return (ret); + abd_verify(dabd); abd_verify(sabd); Index: sys/cddl/contrib/opensolaris/uts/common/fs/zfs/refcount.c =================================================================== --- sys/cddl/contrib/opensolaris/uts/common/fs/zfs/refcount.c +++ sys/cddl/contrib/opensolaris/uts/common/fs/zfs/refcount.c @@ -91,7 +91,7 @@ { reference_t *ref; - ASSERT(rc->rc_count == number); + ASSERT3U(rc->rc_count, ==, number); while (ref = list_head(&rc->rc_list)) { list_remove(&rc->rc_list, ref); kmem_cache_free(reference_cache, ref); Index: sys/cddl/contrib/opensolaris/uts/common/fs/zfs/spa.c =================================================================== --- sys/cddl/contrib/opensolaris/uts/common/fs/zfs/spa.c +++ sys/cddl/contrib/opensolaris/uts/common/fs/zfs/spa.c @@ -55,6 +55,7 @@ #include #include #include +#include #include #include #include @@ -5923,8 +5924,9 @@ vdev_t *oldvd, *newvd, *newrootvd, *pvd, *tvd; vdev_ops_t *pvops; char *oldvdpath, *newvdpath; - int newvd_isspare; + int newvd_isspare = B_FALSE; int error; + boolean_t raidz = B_FALSE; ASSERT(spa_writeable(spa)); @@ -5947,10 +5949,16 @@ if (oldvd == NULL) return (spa_vdev_exit(spa, NULL, txg, ENODEV)); - if (!oldvd->vdev_ops->vdev_op_leaf) + if (oldvd->vdev_ops == &vdev_raidz_ops) { + raidz = B_TRUE; + } else if (!oldvd->vdev_ops->vdev_op_leaf) { return (spa_vdev_exit(spa, NULL, txg, ENOTSUP)); + } - pvd = oldvd->vdev_parent; + if (raidz) + pvd = oldvd; + else + pvd = oldvd->vdev_parent; if ((error = spa_config_parse(spa, &newrootvd, nvroot, NULL, 0, VDEV_ALLOC_ATTACH)) != 0) @@ -5979,6 +5987,7 @@ * vdev. */ if (pvd->vdev_ops != &vdev_mirror_ops && + pvd->vdev_ops != &vdev_raidz_ops && pvd->vdev_ops != &vdev_root_ops) return (spa_vdev_exit(spa, newrootvd, txg, ENOTSUP)); @@ -6018,7 +6027,8 @@ /* * Make sure the new device is big enough. */ - if (newvd->vdev_asize < vdev_get_min_asize(oldvd)) + vdev_t *min_vdev = raidz ? oldvd->vdev_child[0] : oldvd; + if (newvd->vdev_asize < vdev_get_min_asize(min_vdev)) return (spa_vdev_exit(spa, newrootvd, txg, EOVERFLOW)); /* @@ -6028,35 +6038,48 @@ if (newvd->vdev_ashift > oldvd->vdev_top->vdev_ashift) return (spa_vdev_exit(spa, newrootvd, txg, EDOM)); + if (raidz) { + oldvdpath = kmem_asprintf("raidz%u-%u", + oldvd->vdev_nparity, oldvd->vdev_id); + } else { + oldvdpath = spa_strdup(oldvd->vdev_path); + } + newvdpath = spa_strdup(newvd->vdev_path); + /* * If this is an in-place replacement, update oldvd's path and devid * to make it distinguishable from newvd, and unopenable from now on. 
*/ - if (strcmp(oldvd->vdev_path, newvd->vdev_path) == 0) { + if (strcmp(oldvdpath, newvdpath) == 0) { spa_strfree(oldvd->vdev_path); - oldvd->vdev_path = kmem_alloc(strlen(newvd->vdev_path) + 5, + oldvd->vdev_path = kmem_alloc(strlen(newvdpath) + 5, KM_SLEEP); - (void) sprintf(oldvd->vdev_path, "%s/%s", - newvd->vdev_path, "old"); + (void) sprintf(oldvd->vdev_path, "%s/old", + newvdpath); if (oldvd->vdev_devid != NULL) { spa_strfree(oldvd->vdev_devid); oldvd->vdev_devid = NULL; } + spa_strfree(oldvdpath); + oldvdpath = spa_strdup(oldvd->vdev_path); } /* mark the device being resilvered */ - newvd->vdev_resilver_txg = txg; + if (!raidz) + newvd->vdev_resilver_txg = txg; /* * If the parent is not a mirror, or if we're replacing, insert the new * mirror/replacing/spare vdev above oldvd. */ - if (pvd->vdev_ops != pvops) + if (!raidz && pvd->vdev_ops != pvops) pvd = vdev_add_parent(oldvd, pvops); ASSERT(pvd->vdev_top->vdev_parent == rvd); +#if 0 ASSERT(pvd->vdev_ops == pvops); ASSERT(oldvd->vdev_parent == pvd); +#endif /* * Extract the new device from its root and add it to pvd. @@ -6079,29 +6102,34 @@ */ dtl_max_txg = txg + TXG_CONCURRENT_STATES; - vdev_dtl_dirty(newvd, DTL_MISSING, TXG_INITIAL, - dtl_max_txg - TXG_INITIAL); + if (raidz) { + dmu_tx_t *tx = dmu_tx_create_assigned(spa->spa_dsl_pool, txg); + dsl_sync_task_nowait(spa->spa_dsl_pool, vdev_raidz_attach_sync, + newvd, 0, ZFS_SPACE_CHECK_EXTRA_RESERVED, tx); + dmu_tx_commit(tx); + } else { + vdev_dtl_dirty(newvd, DTL_MISSING, TXG_INITIAL, + dtl_max_txg - TXG_INITIAL); - if (newvd->vdev_isspare) { - spa_spare_activate(newvd); - spa_event_notify(spa, newvd, NULL, ESC_ZFS_VDEV_SPARE); - } + if (newvd->vdev_isspare) { + spa_spare_activate(newvd); + spa_event_notify(spa, newvd, NULL, ESC_ZFS_VDEV_SPARE); + } - oldvdpath = spa_strdup(oldvd->vdev_path); - newvdpath = spa_strdup(newvd->vdev_path); - newvd_isspare = newvd->vdev_isspare; + newvd_isspare = newvd->vdev_isspare; - /* - * Mark newvd's DTL dirty in this txg. - */ - vdev_dirty(tvd, VDD_DTL, newvd, txg); + /* + * Mark newvd's DTL dirty in this txg. + */ + vdev_dirty(tvd, VDD_DTL, newvd, txg); - /* - * Schedule the resilver to restart in the future. We do this to - * ensure that dmu_sync-ed blocks have been stitched into the - * respective datasets. - */ - dsl_resilver_restart(spa->spa_dsl_pool, dtl_max_txg); + /* + * Schedule the resilver to restart in the future. We do this to + * ensure that dmu_sync-ed blocks have been stitched into the + * respective datasets. + */ + dsl_resilver_restart(spa->spa_dsl_pool, dtl_max_txg); + } if (spa->spa_bootfs) spa_event_notify(spa, newvd, NULL, ESC_ZFS_BOOTFS_VDEV_ATTACH); @@ -6113,6 +6141,10 @@ */ (void) spa_vdev_exit(spa, newrootvd, dtl_max_txg, 0); + if (raidz) { + error = vdev_online(spa, guid, ZFS_ONLINE_EXPAND, NULL); + } + spa_history_log_internal(spa, "vdev attach", NULL, "%s vdev=%s %s vdev=%s", replacing && newvd_isspare ? 
"spare in" : Index: sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/vdev_raidz.h =================================================================== --- sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/vdev_raidz.h +++ sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/vdev_raidz.h @@ -39,10 +39,18 @@ extern "C" { #endif -#ifdef _KERNEL +typedef struct vdev_raidz { + int vd_logical_width; + int vd_physical_width; + int vd_nparity; + boolean_t vn_expanding; +} vdev_raidz_t; + extern int vdev_raidz_physio(vdev_t *, caddr_t, size_t, uint64_t, uint64_t, boolean_t, boolean_t); -#endif +extern void vdev_raidz_attach_sync(void *, dmu_tx_t *); +extern void vdev_raidz_config_generate(vdev_t *, nvlist_t *); +extern void *vdev_raidz_get_tsd(spa_t *, nvlist_t *); #ifdef __cplusplus } #endif Index: sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev.c =================================================================== --- sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev.c +++ sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev.c @@ -49,6 +49,7 @@ #include #include #include +#include #include #include @@ -584,7 +585,7 @@ { vdev_ops_t *ops; char *type; - uint64_t guid = 0, islog, nparity; + uint64_t guid = 0, islog; vdev_t *vd; vdev_indirect_config_t *vic; @@ -637,47 +638,21 @@ if (ops == &vdev_hole_ops && spa_version(spa) < SPA_VERSION_HOLES) return (SET_ERROR(ENOTSUP)); - /* - * Set the nparity property for RAID-Z vdevs. - */ - nparity = -1ULL; + void *tsd = NULL; + int nparity = 0; if (ops == &vdev_raidz_ops) { - if (nvlist_lookup_uint64(nv, ZPOOL_CONFIG_NPARITY, - &nparity) == 0) { - if (nparity == 0 || nparity > VDEV_RAIDZ_MAXPARITY) - return (SET_ERROR(EINVAL)); - /* - * Previous versions could only support 1 or 2 parity - * device. - */ - if (nparity > 1 && - spa_version(spa) < SPA_VERSION_RAIDZ2) - return (SET_ERROR(ENOTSUP)); - if (nparity > 2 && - spa_version(spa) < SPA_VERSION_RAIDZ3) - return (SET_ERROR(ENOTSUP)); - } else { - /* - * We require the parity to be specified for SPAs that - * support multiple parity levels. - */ - if (spa_version(spa) >= SPA_VERSION_RAIDZ2) - return (SET_ERROR(EINVAL)); - /* - * Otherwise, we default to 1 parity device for RAID-Z. - */ - nparity = 1; - } - } else { - nparity = 0; + vdev_raidz_t *rz = tsd = vdev_raidz_get_tsd(spa, nv); + if (rz == NULL) + return (SET_ERROR(EINVAL)); + nparity = rz->vd_nparity; } - ASSERT(nparity != -1ULL); vd = vdev_alloc_common(spa, id, guid, ops); vic = &vd->vdev_indirect_config; vd->vdev_islog = islog; vd->vdev_nparity = nparity; + vd->vdev_tsd = tsd; if (nvlist_lookup_string(nv, ZPOOL_CONFIG_PATH, &vd->vdev_path) == 0) vd->vdev_path = spa_strdup(vd->vdev_path); @@ -849,6 +824,11 @@ ASSERT(vd->vdev_child == NULL); ASSERT(vd->vdev_guid_sum == vd->vdev_guid); + if (vd->vdev_ops == &vdev_raidz_ops) { + vdev_raidz_t *rz = vd->vdev_tsd; + kmem_free(rz, sizeof(*rz)); + } + /* * Discard allocation state. 
*/ @@ -3155,8 +3135,10 @@ if ((vd = spa_lookup_by_guid(spa, guid, B_TRUE)) == NULL) return (spa_vdev_state_exit(spa, NULL, ENODEV)); +#if 0 if (!vd->vdev_ops->vdev_op_leaf) return (spa_vdev_state_exit(spa, NULL, ENOTSUP)); +#endif wasoffline = (vd->vdev_offline || vd->vdev_tmpoffline); oldstate = vd->vdev_state; Index: sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev_indirect.c =================================================================== --- sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev_indirect.c +++ sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev_indirect.c @@ -1078,6 +1078,13 @@ if (vd->vdev_ops == &vdev_indirect_ops) return; + printf("vdev_indirect_io_start_cb: src=%llx split_offset=%x dst: vd=%u off=%llx size=%x\n", + (long long)zio->io_offset, + (int)split_offset, + (int)vd->vdev_id, + (long long)offset, + (int)size); + zio_nowait(zio_vdev_child_io(zio, NULL, vd, offset, abd_get_offset(zio->io_abd, split_offset), size, zio->io_type, zio->io_priority, Index: sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev_label.c =================================================================== --- sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev_label.c +++ sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev_label.c @@ -141,6 +141,7 @@ #include #include #include +#include #include #include #include @@ -276,31 +277,13 @@ if (vd->vdev_fru != NULL) fnvlist_add_string(nv, ZPOOL_CONFIG_FRU, vd->vdev_fru); - if (vd->vdev_nparity != 0) { - ASSERT(strcmp(vd->vdev_ops->vdev_op_type, - VDEV_TYPE_RAIDZ) == 0); + if (vd->vdev_ops == &vdev_raidz_ops) + vdev_raidz_config_generate(vd, nv); - /* - * Make sure someone hasn't managed to sneak a fancy new vdev - * into a crufty old storage pool. - */ - ASSERT(vd->vdev_nparity == 1 || - (vd->vdev_nparity <= 2 && - spa_version(spa) >= SPA_VERSION_RAIDZ2) || - (vd->vdev_nparity <= 3 && - spa_version(spa) >= SPA_VERSION_RAIDZ3)); - - /* - * Note that we'll add the nparity tag even on storage pools - * that only support a single parity device -- older software - * will just ignore it. - */ - fnvlist_add_uint64(nv, ZPOOL_CONFIG_NPARITY, vd->vdev_nparity); - } - - if (vd->vdev_wholedisk != -1ULL) + if (vd->vdev_wholedisk != -1ULL) { fnvlist_add_uint64(nv, ZPOOL_CONFIG_WHOLE_DISK, vd->vdev_wholedisk); + } if (vd->vdev_not_present && !(flags & VDEV_CONFIG_MISSING)) fnvlist_add_uint64(nv, ZPOOL_CONFIG_NOT_PRESENT, 1); Index: sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev_raidz.c =================================================================== --- sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev_raidz.c +++ sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev_raidz.c @@ -28,12 +28,14 @@ #include #include +#include #include #ifdef illumos #include #endif #include #include +#include #include #include #include @@ -41,6 +43,12 @@ #include #include +#if 0 +#ifdef ZFS_DEBUG +#include /* vdev_xlate testing */ +#endif +#endif + /* * Virtual device vector for RAID-Z. * @@ -113,27 +121,31 @@ uint64_t rc_offset; /* device offset */ uint64_t rc_size; /* I/O size */ abd_t *rc_abd; /* I/O data */ + void *rc_orig_data; /* pre-reconstruction */ void *rc_gdata; /* used to store the "good" version */ int rc_error; /* I/O error for this device */ uint8_t rc_tried; /* Did we attempt this I/O column? */ uint8_t rc_skipped; /* Did we skip this I/O column? */ + uint8_t rc_need_orig_restore; /* need to restore from orig_data? 
*/ } raidz_col_t; +typedef struct raidz_row { + uint64_t rr_cols; /* Regular column count */ + uint64_t rr_missingdata; /* Count of missing data devices */ + uint64_t rr_missingparity; /* Count of missing parity devices */ + uint64_t rr_firstdatacol; /* First data column/parity count */ + abd_t *rr_abd_copy; /* rm_asize-buffer of copied data */ + int rr_code; /* reconstruction code */ + raidz_col_t rr_col[0]; /* Flexible array of I/O columns */ +} raidz_row_t; + typedef struct raidz_map { - uint64_t rm_cols; /* Regular column count */ - uint64_t rm_scols; /* Count including skipped columns */ - uint64_t rm_bigcols; /* Number of oversized columns */ - uint64_t rm_asize; /* Actual total I/O size */ - uint64_t rm_missingdata; /* Count of missing data devices */ - uint64_t rm_missingparity; /* Count of missing parity devices */ - uint64_t rm_firstdatacol; /* First data column/parity count */ - uint64_t rm_nskip; /* Skipped sectors for padding */ - uint64_t rm_skipstart; /* Column index of padding start */ - abd_t *rm_abd_copy; /* rm_asize-buffer of copied data */ uintptr_t rm_reports; /* # of referencing checksum reports */ - uint8_t rm_freed; /* map no longer has referencing ZIO */ - uint8_t rm_ecksuminjected; /* checksum error was injected */ - raidz_col_t rm_col[1]; /* Flexible array of I/O columns */ + boolean_t rm_freed; /* map no longer has referencing ZIO */ + boolean_t rm_ecksuminjected; /* checksum error was injected */ + int rm_nrows; + int rm_nskip; /* Sectors skipped for padding */ + raidz_row_t *rm_row[0]; /* flexible array of rows */ } raidz_map_t; #define VDEV_RAIDZ_P 0 @@ -241,7 +253,7 @@ 0x74, 0xd6, 0xf4, 0xea, 0xa8, 0x50, 0x58, 0xaf, }; -static void vdev_raidz_generate_parity(raidz_map_t *rm); +static void vdev_raidz_generate_parity(raidz_row_t *); /* * Multiply a given number by 2 raised to the given power. 
@@ -263,31 +275,46 @@ } static void -vdev_raidz_map_free(raidz_map_t *rm) +vdev_raidz_row_free(raidz_row_t *rr) { int c; - size_t size; - for (c = 0; c < rm->rm_firstdatacol; c++) { - if (rm->rm_col[c].rc_abd != NULL) - abd_free(rm->rm_col[c].rc_abd); + for (c = 0; c < rr->rr_firstdatacol && c < rr->rr_cols; c++) { + if (rr->rr_col[c].rc_abd != NULL) + abd_free(rr->rr_col[c].rc_abd); - if (rm->rm_col[c].rc_gdata != NULL) - zio_buf_free(rm->rm_col[c].rc_gdata, - rm->rm_col[c].rc_size); + if (rr->rr_col[c].rc_gdata != NULL) { + zio_buf_free(rr->rr_col[c].rc_gdata, + rr->rr_col[c].rc_size); + } + if (rr->rr_col[c].rc_orig_data != NULL) { + zio_buf_free(rr->rr_col[c].rc_orig_data, + rr->rr_col[c].rc_size); + } } - size = 0; - for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) { - if (rm->rm_col[c].rc_abd != NULL) - abd_put(rm->rm_col[c].rc_abd); - size += rm->rm_col[c].rc_size; + for (c = rr->rr_firstdatacol; c < rr->rr_cols; c++) { + if (rr->rr_col[c].rc_abd != NULL) + abd_put(rr->rr_col[c].rc_abd); + if (rr->rr_col[c].rc_orig_data != NULL) { + zio_buf_free(rr->rr_col[c].rc_orig_data, + rr->rr_col[c].rc_size); + } } - if (rm->rm_abd_copy != NULL) - abd_free(rm->rm_abd_copy); + if (rr->rr_abd_copy != NULL) + abd_free(rr->rr_abd_copy); - kmem_free(rm, offsetof(raidz_map_t, rm_col[rm->rm_scols])); + kmem_free(rr, offsetof(raidz_row_t, rr_col[rr->rr_cols])); +} + +static void +vdev_raidz_map_free(raidz_map_t *rm) +{ + for (int i = 0; i < rm->rm_nrows; i++) { + vdev_raidz_row_free(rm->rm_row[i]); + } + kmem_free(rm, offsetof(raidz_map_t, rm_row[rm->rm_nrows])); } static void @@ -296,10 +323,11 @@ raidz_map_t *rm = zio->io_vsd; ASSERT0(rm->rm_freed); - rm->rm_freed = 1; + rm->rm_freed = B_TRUE; - if (rm->rm_reports == 0) + if (rm->rm_reports == 0) { vdev_raidz_map_free(rm); + } } /*ARGSUSED*/ @@ -310,7 +338,7 @@ ASSERT3U(rm->rm_reports, >, 0); - if (--rm->rm_reports == 0 && rm->rm_freed != 0) + if (--rm->rm_reports == 0 && rm->rm_freed) vdev_raidz_map_free(rm); } @@ -324,18 +352,22 @@ const char *good = NULL; char *bad; + zfs_dbgmsg("checksum error on rm=%p", rm); + if (good_data == NULL) { zfs_ereport_finish_checksum(zcr, NULL, NULL, B_FALSE); return; } - if (c < rm->rm_firstdatacol) { + zfs_ereport_finish_checksum(zcr, NULL, NULL, B_FALSE); +#if 0 + if (c < rm->rr_firstdatacol) { /* * The first time through, calculate the parity blocks for * the good data (this relies on the fact that the good * data never changes for a given logical ZIO) */ - if (rm->rm_col[0].rc_gdata == NULL) { + if (rm->rr_col[0].rc_gdata == NULL) { abd_t *bad_parity[VDEV_RAIDZ_MAXPARITY]; char *buf; int offset; @@ -345,22 +377,22 @@ * good_data, first saving the parity bufs and * replacing them with buffers to hold the result. 
*/ - for (x = 0; x < rm->rm_firstdatacol; x++) { - bad_parity[x] = rm->rm_col[x].rc_abd; - rm->rm_col[x].rc_gdata = - zio_buf_alloc(rm->rm_col[x].rc_size); - rm->rm_col[x].rc_abd = - abd_get_from_buf(rm->rm_col[x].rc_gdata, - rm->rm_col[x].rc_size); + for (x = 0; x < rm->rr_firstdatacol; x++) { + bad_parity[x] = rm->rr_col[x].rc_abd; + rm->rr_col[x].rc_gdata = + zio_buf_alloc(rm->rr_col[x].rc_size); + rm->rr_col[x].rc_abd = + abd_get_from_buf(rm->rr_col[x].rc_gdata, + rm->rr_col[x].rc_size); } /* fill in the data columns from good_data */ buf = (char *)good_data; - for (; x < rm->rm_cols; x++) { - abd_put(rm->rm_col[x].rc_abd); - rm->rm_col[x].rc_abd = abd_get_from_buf(buf, - rm->rm_col[x].rc_size); - buf += rm->rm_col[x].rc_size; + for (; x < rm->rr_cols; x++) { + abd_put(rm->rr_col[x].rc_abd); + rm->rr_col[x].rc_abd = abd_get_from_buf(buf, + rm->rr_col[x].rc_size); + buf += rm->rr_col[x].rc_size; } /* @@ -369,34 +401,35 @@ vdev_raidz_generate_parity(rm); /* restore everything back to its original state */ - for (x = 0; x < rm->rm_firstdatacol; x++) { - abd_put(rm->rm_col[x].rc_abd); - rm->rm_col[x].rc_abd = bad_parity[x]; + for (x = 0; x < rm->rr_firstdatacol; x++) { + abd_put(rm->rr_col[x].rc_abd); + rm->rr_col[x].rc_abd = bad_parity[x]; } offset = 0; - for (x = rm->rm_firstdatacol; x < rm->rm_cols; x++) { - abd_put(rm->rm_col[x].rc_abd); - rm->rm_col[x].rc_abd = abd_get_offset( - rm->rm_abd_copy, offset); - offset += rm->rm_col[x].rc_size; + for (x = rm->rr_firstdatacol; x < rm->rr_cols; x++) { + abd_put(rm->rr_col[x].rc_abd); + rm->rr_col[x].rc_abd = abd_get_offset( + rm->rr_abd_copy, offset); + offset += rm->rr_col[x].rc_size; } } - ASSERT3P(rm->rm_col[c].rc_gdata, !=, NULL); - good = rm->rm_col[c].rc_gdata; + ASSERT3P(rm->rr_col[c].rc_gdata, !=, NULL); + good = rm->rr_col[c].rc_gdata; } else { /* adjust good_data to point at the start of our column */ good = good_data; - for (x = rm->rm_firstdatacol; x < c; x++) - good += rm->rm_col[x].rc_size; + for (x = rm->rr_firstdatacol; x < c; x++) + good += rm->rr_col[x].rc_size; } - bad = abd_borrow_buf_copy(rm->rm_col[c].rc_abd, rm->rm_col[c].rc_size); + bad = abd_borrow_buf_copy(rm->rr_col[c].rc_abd, rm->rr_col[c].rc_size); /* we drop the ereport if it ends up that the data was good */ zfs_ereport_finish_checksum(zcr, good, bad, B_TRUE); - abd_return_buf(rm->rm_col[c].rc_abd, bad, rm->rm_col[c].rc_size); + abd_return_buf(rm->rr_col[c].rc_abd, bad, rm->rr_col[c].rc_size); +#endif } /* @@ -409,10 +442,7 @@ vdev_raidz_cksum_report(zio_t *zio, zio_cksum_report_t *zcr, void *arg) { size_t c = (size_t)(uintptr_t)arg; - size_t offset; - raidz_map_t *rm = zio->io_vsd; - size_t size; /* set up the report and bump the refcount */ zcr->zcr_cbdata = rm; @@ -423,7 +453,7 @@ rm->rm_reports++; ASSERT3U(rm->rm_reports, >, 0); - if (rm->rm_abd_copy != NULL) + if (rm->rm_row[0]->rr_abd_copy != NULL) return; /* @@ -435,24 +465,33 @@ * to copy them. 
*/ - size = 0; - for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) - size += rm->rm_col[c].rc_size; + for (int i = 0; i < rm->rm_nrows; i++) { + raidz_row_t *rr = rm->rm_row[i]; + size_t offset; + size_t size = 0; + for (c = rr->rr_firstdatacol; c < rr->rr_cols; c++) + size += rr->rr_col[c].rc_size; + + rr->rr_abd_copy = + abd_alloc_sametype(rr->rr_col[rr->rr_firstdatacol].rc_abd, + size); - rm->rm_abd_copy = - abd_alloc_sametype(rm->rm_col[rm->rm_firstdatacol].rc_abd, size); + for (offset = 0, c = rr->rr_firstdatacol; c < rr->rr_cols; c++) { + raidz_col_t *col = &rr->rr_col[c]; + + if (col->rc_size == 0) + continue; - for (offset = 0, c = rm->rm_firstdatacol; c < rm->rm_cols; c++) { - raidz_col_t *col = &rm->rm_col[c]; - abd_t *tmp = abd_get_offset(rm->rm_abd_copy, offset); + abd_t *tmp = abd_get_offset(rr->rr_abd_copy, offset); - abd_copy(tmp, col->rc_abd, col->rc_size); - abd_put(col->rc_abd); - col->rc_abd = tmp; + abd_copy(tmp, col->rc_abd, col->rc_size); + abd_put(col->rc_abd); + col->rc_abd = tmp; - offset += col->rc_size; + offset += col->rc_size; + } + ASSERT3U(offset, ==, size); } - ASSERT3U(offset, ==, size); } static const zio_vsd_ops_t vdev_raidz_vsd_ops = { @@ -468,7 +507,7 @@ vdev_raidz_map_alloc(abd_t *abd, uint64_t size, uint64_t offset, boolean_t dofree, uint64_t unit_shift, uint64_t dcols, uint64_t nparity) { - raidz_map_t *rm; + raidz_row_t *rr; /* The starting RAIDZ (parent) vdev sector of the block. */ uint64_t b = offset >> unit_shift; /* The zio's size in units of the vdev's minimum sector size. */ @@ -477,9 +516,13 @@ uint64_t f = b % dcols; /* The starting byte offset on each child vdev. */ uint64_t o = (b / dcols) << unit_shift; - uint64_t q, r, c, bc, col, acols, scols, coff, devidx, asize, tot; + uint64_t q, r, c, bc, col, acols, coff, devidx, asize, tot; uint64_t off = 0; + raidz_map_t *rm = + kmem_zalloc(offsetof(raidz_map_t, rm_row[1]), KM_SLEEP); + rm->rm_nrows = 1; + /* * "Quotient": The number of data sectors for this stripe on all but * the "big column" child vdevs that also contain "remainder" data. @@ -502,77 +545,63 @@ tot = s + nparity * (q + (r == 0 ? 0 : 1)); /* acols: The columns that will be accessed. */ - /* scols: The columns that will be accessed or skipped. */ if (q == 0) { /* Our I/O request doesn't span all child vdevs. 
*/ acols = bc; - scols = MIN(dcols, roundup(bc, nparity + 1)); } else { acols = dcols; - scols = dcols; } - ASSERT3U(acols, <=, scols); - - rm = kmem_alloc(offsetof(raidz_map_t, rm_col[scols]), KM_SLEEP); + rr = kmem_alloc(offsetof(raidz_row_t, rr_col[acols]), KM_SLEEP); + rm->rm_row[0] = rr; - rm->rm_cols = acols; - rm->rm_scols = scols; - rm->rm_bigcols = bc; - rm->rm_skipstart = bc; - rm->rm_missingdata = 0; - rm->rm_missingparity = 0; - rm->rm_firstdatacol = nparity; - rm->rm_abd_copy = NULL; - rm->rm_reports = 0; - rm->rm_freed = 0; - rm->rm_ecksuminjected = 0; + rr->rr_cols = acols; + rr->rr_missingdata = 0; + rr->rr_missingparity = 0; + rr->rr_firstdatacol = nparity; + rr->rr_abd_copy = NULL; asize = 0; - for (c = 0; c < scols; c++) { + for (c = 0; c < acols; c++) { col = f + c; coff = o; if (col >= dcols) { col -= dcols; coff += 1ULL << unit_shift; } - rm->rm_col[c].rc_devidx = col; - rm->rm_col[c].rc_offset = coff; - rm->rm_col[c].rc_abd = NULL; - rm->rm_col[c].rc_gdata = NULL; - rm->rm_col[c].rc_error = 0; - rm->rm_col[c].rc_tried = 0; - rm->rm_col[c].rc_skipped = 0; - - if (c >= acols) - rm->rm_col[c].rc_size = 0; - else if (c < bc) - rm->rm_col[c].rc_size = (q + 1) << unit_shift; + rr->rr_col[c].rc_devidx = col; + rr->rr_col[c].rc_offset = coff; + rr->rr_col[c].rc_abd = NULL; + rr->rr_col[c].rc_gdata = NULL; + rr->rr_col[c].rc_orig_data = NULL; + rr->rr_col[c].rc_error = 0; + rr->rr_col[c].rc_tried = 0; + rr->rr_col[c].rc_skipped = 0; + rr->rr_col[c].rc_need_orig_restore = B_FALSE; + + if (c < bc) + rr->rr_col[c].rc_size = (q + 1) << unit_shift; else - rm->rm_col[c].rc_size = q << unit_shift; + rr->rr_col[c].rc_size = q << unit_shift; - asize += rm->rm_col[c].rc_size; + asize += rr->rr_col[c].rc_size; } ASSERT3U(asize, ==, tot << unit_shift); - rm->rm_asize = roundup(asize, (nparity + 1) << unit_shift); rm->rm_nskip = roundup(tot, nparity + 1) - tot; - ASSERT3U(rm->rm_asize - asize, ==, rm->rm_nskip << unit_shift); - ASSERT3U(rm->rm_nskip, <=, nparity); if (!dofree) { - for (c = 0; c < rm->rm_firstdatacol; c++) { - rm->rm_col[c].rc_abd = - abd_alloc_linear(rm->rm_col[c].rc_size, B_TRUE); - } + for (c = 0; c < rr->rr_firstdatacol; c++) + rr->rr_col[c].rc_abd = + abd_alloc_linear(rr->rr_col[c].rc_size, B_TRUE); - rm->rm_col[c].rc_abd = abd_get_offset(abd, 0); - off = rm->rm_col[c].rc_size; + rr->rr_col[c].rc_abd = abd_get_offset(abd, 0); + off = rr->rr_col[c].rc_size; for (c = c + 1; c < acols; c++) { - rm->rm_col[c].rc_abd = abd_get_offset(abd, off); - off += rm->rm_col[c].rc_size; + rr->rr_col[c].rc_abd = abd_get_offset(abd, off); + off += rr->rr_col[c].rc_size; } } @@ -596,20 +625,182 @@ * skip the first column since at least one data and one parity * column must appear in each row. */ - ASSERT(rm->rm_cols >= 2); - ASSERT(rm->rm_col[0].rc_size == rm->rm_col[1].rc_size); + ASSERT(rr->rr_cols >= 2); + ASSERT(rr->rr_col[0].rc_size == rr->rr_col[1].rc_size); + + if (rr->rr_firstdatacol == 1 && (offset & (1ULL << 20))) { + devidx = rr->rr_col[0].rc_devidx; + o = rr->rr_col[0].rc_offset; + rr->rr_col[0].rc_devidx = rr->rr_col[1].rc_devidx; + rr->rr_col[0].rc_offset = rr->rr_col[1].rc_offset; + rr->rr_col[1].rc_devidx = devidx; + rr->rr_col[1].rc_offset = o; + } + + return (rm); +} + +static raidz_map_t * +vdev_raidz_map_alloc_expanded(abd_t *abd, uint64_t size, uint64_t offset, boolean_t dofree, + uint64_t ashift, uint64_t physical_cols, uint64_t logical_cols, + uint64_t nparity) +{ + /* The starting RAIDZ (parent) vdev sector of the block. 
*/ + uint64_t b = offset >> ashift; + /* The zio's size in units of the vdev's minimum sector size. */ + uint64_t s = size >> ashift; + uint64_t cur_col = b % physical_cols; + /* The starting byte offset on each child vdev. */ + uint64_t child_offset = (b / physical_cols) << ashift; + uint64_t q, r, bc, devidx, asize, tot; + + /* + * "Quotient": The number of data sectors for this stripe on all but + * the "big column" child vdevs that also contain "remainder" data. + * AKA "full rows" + */ + q = s / (logical_cols - nparity); + + /* + * "Remainder": The number of partial stripe data sectors in this I/O. + * This will add a sector to some, but not all, child vdevs. + */ + r = s - q * (logical_cols - nparity); + + /* The number of "big columns" - those which contain remainder data. */ + bc = (r == 0 ? 0 : r + nparity); + + /* + * The total number of data and parity sectors associated with + * this I/O. + */ + tot = s + nparity * (q + (r == 0 ? 0 : 1)); + + /* How many rows contain data (not skip) */ + uint64_t rows = howmany(tot, logical_cols); + int cols = MIN(tot, logical_cols); + + raidz_map_t *rm = + kmem_zalloc(offsetof(raidz_map_t, rm_row[rows]), + KM_SLEEP); + rm->rm_nrows = rows; + rm->rm_nskip = roundup(tot, nparity + 1) - tot; + asize = 0; - if (rm->rm_firstdatacol == 1 && (offset & (1ULL << 20))) { - devidx = rm->rm_col[0].rc_devidx; - o = rm->rm_col[0].rc_offset; - rm->rm_col[0].rc_devidx = rm->rm_col[1].rc_devidx; - rm->rm_col[0].rc_offset = rm->rm_col[1].rc_offset; - rm->rm_col[1].rc_devidx = devidx; - rm->rm_col[1].rc_offset = o; + zfs_dbgmsg("rm=%p s=%d q=%d r=%d bc=%d nrows=%d cols=%d", + rm, (int)s, (int)q, (int)r, (int)bc, (int)rows, (int)cols); + + for (uint64_t row = 0; row < rows; row++) { + raidz_row_t *rr = kmem_alloc(offsetof(raidz_row_t, + rr_col[cols]), KM_SLEEP); + rm->rm_row[row] = rr; + + /* + * We set cols to the entire width of the block, even + * if this row is shorter. This is needed because parity + * generation (for Q and R) needs to know the entire width, + * because it treats the short row as though it was + * full-width (and the "phantom" sectors were zero-filled). + * + * Another approach to this would be to set cols shorter + * (to just the number of columns that we might do i/o to) + * and have another mechanism to tell the parity generation + * about the "entire width". Reconstruction (at least + * vdev_raidz_reconstruct_general()) would also need to + * know about the "entire width". + */ + rr->rr_cols = cols; + rr->rr_missingdata = 0; + rr->rr_missingparity = 0; + rr->rr_firstdatacol = nparity; + rr->rr_abd_copy = NULL; + + for (int c = 0; c < rr->rr_cols; c++, cur_col++) { + if (cur_col >= physical_cols) { + cur_col -= physical_cols; + child_offset += 1ULL << ashift; + } + rr->rr_col[c].rc_devidx = cur_col; + rr->rr_col[c].rc_offset = child_offset; + rr->rr_col[c].rc_gdata = NULL; + rr->rr_col[c].rc_orig_data = NULL; + rr->rr_col[c].rc_error = 0; + rr->rr_col[c].rc_tried = 0; + rr->rr_col[c].rc_skipped = 0; + rr->rr_col[c].rc_abd = NULL; + rr->rr_col[c].rc_need_orig_restore = B_FALSE; + + uint64_t dc = c - rr->rr_firstdatacol; + if (c < rr->rr_firstdatacol) { + rr->rr_col[c].rc_size = 1ULL << ashift; + if (!dofree) { + rr->rr_col[c].rc_abd = + abd_alloc_linear(rr->rr_col[c].rc_size, + B_TRUE); + } + } else if (row == rows - 1 && bc != 0 && c >= bc) { + /* + * Past the end, this for parity generation. 
+ */ + rr->rr_col[c].rc_size = 0; + rr->rr_col[c].rc_abd = NULL; + } else { + /* XXX ASCII art diagram here */ + /* "data column" (col excluding parity) */ + uint64_t off; + + if (c < bc || r == 0) { + off = dc * rows + row; + } else { + off = r * rows + + (dc - r) * (rows - 1) + row; + } + zfs_dbgmsg("rm=%p row=%d c=%d dc=%d off=%u devidx=%u", + rm, (int)row, (int)c, (int)dc, (int)off, (int)cur_col); + rr->rr_col[c].rc_size = 1ULL << ashift; + if (!dofree) { + rr->rr_col[c].rc_abd = + abd_get_offset(abd, off << ashift); + } + } + + asize += rr->rr_col[c].rc_size; + } + + /* + * If all data stored spans all columns, there's a danger that parity + * will always be on the same device and, since parity isn't read + * during normal operation, that that device's I/O bandwidth won't be + * used effectively. We therefore switch the parity every 1MB. + * + * ... at least that was, ostensibly, the theory. As a practical + * matter unless we juggle the parity between all devices evenly, we + * won't see any benefit. Further, occasional writes that aren't a + * multiple of the LCM of the number of children and the minimum + * stripe width are sufficient to avoid pessimal behavior. + * Unfortunately, this decision created an implicit on-disk format + * requirement that we need to support for all eternity, but only + * for single-parity RAID-Z. + * + * If we intend to skip a sector in the zeroth column for padding + * we must make sure to note this swap. We will never intend to + * skip the first column since at least one data and one parity + * column must appear in each row. + */ + if (rr->rr_firstdatacol == 1 && rr->rr_cols > 1 && + (offset & (1ULL << 20))) { + ASSERT(rr->rr_cols >= 2); + ASSERT(rr->rr_col[0].rc_size == rr->rr_col[1].rc_size); + devidx = rr->rr_col[0].rc_devidx; + uint64_t o = rr->rr_col[0].rc_offset; + rr->rr_col[0].rc_devidx = rr->rr_col[1].rc_devidx; + rr->rr_col[0].rc_offset = rr->rr_col[1].rc_offset; + rr->rr_col[1].rc_devidx = devidx; + rr->rr_col[1].rc_offset = o; + } - if (rm->rm_skipstart == 0) - rm->rm_skipstart = 1; } + ASSERT3U(asize, ==, tot << ashift); return (rm); } @@ -676,55 +867,48 @@ } static void -vdev_raidz_generate_parity_p(raidz_map_t *rm) +vdev_raidz_generate_parity_p(raidz_row_t *rr) { - uint64_t *p; - int c; - abd_t *src; + uint64_t *p = abd_to_buf(rr->rr_col[VDEV_RAIDZ_P].rc_abd); - for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) { - src = rm->rm_col[c].rc_abd; - p = abd_to_buf(rm->rm_col[VDEV_RAIDZ_P].rc_abd); + for (int c = rr->rr_firstdatacol; c < rr->rr_cols; c++) { + abd_t *src = rr->rr_col[c].rc_abd; - if (c == rm->rm_firstdatacol) { - abd_copy_to_buf(p, src, rm->rm_col[c].rc_size); + if (c == rr->rr_firstdatacol) { + abd_copy_to_buf(p, src, rr->rr_col[c].rc_size); } else { struct pqr_struct pqr = { p, NULL, NULL }; - (void) abd_iterate_func(src, 0, rm->rm_col[c].rc_size, + (void) abd_iterate_func(src, 0, rr->rr_col[c].rc_size, vdev_raidz_p_func, &pqr); } } } static void -vdev_raidz_generate_parity_pq(raidz_map_t *rm) +vdev_raidz_generate_parity_pq(raidz_row_t *rr) { - uint64_t *p, *q, pcnt, ccnt, mask, i; - int c; - abd_t *src; + uint64_t *p = abd_to_buf(rr->rr_col[VDEV_RAIDZ_P].rc_abd); + uint64_t *q = abd_to_buf(rr->rr_col[VDEV_RAIDZ_Q].rc_abd); + uint64_t pcnt = rr->rr_col[VDEV_RAIDZ_P].rc_size / sizeof (p[0]); + ASSERT(rr->rr_col[VDEV_RAIDZ_P].rc_size == + rr->rr_col[VDEV_RAIDZ_Q].rc_size); - pcnt = rm->rm_col[VDEV_RAIDZ_P].rc_size / sizeof (p[0]); - ASSERT(rm->rm_col[VDEV_RAIDZ_P].rc_size == - rm->rm_col[VDEV_RAIDZ_Q].rc_size); + for (int c 
= rr->rr_firstdatacol; c < rr->rr_cols; c++) { + abd_t *src = rr->rr_col[c].rc_abd; - for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) { - src = rm->rm_col[c].rc_abd; - p = abd_to_buf(rm->rm_col[VDEV_RAIDZ_P].rc_abd); - q = abd_to_buf(rm->rm_col[VDEV_RAIDZ_Q].rc_abd); + uint64_t ccnt = rr->rr_col[c].rc_size / sizeof (p[0]); - ccnt = rm->rm_col[c].rc_size / sizeof (p[0]); - - if (c == rm->rm_firstdatacol) { - abd_copy_to_buf(p, src, rm->rm_col[c].rc_size); - (void) memcpy(q, p, rm->rm_col[c].rc_size); + if (c == rr->rr_firstdatacol) { + abd_copy_to_buf(p, src, rr->rr_col[c].rc_size); + (void) memcpy(q, p, rr->rr_col[c].rc_size); } else { struct pqr_struct pqr = { p, q, NULL }; - (void) abd_iterate_func(src, 0, rm->rm_col[c].rc_size, + (void) abd_iterate_func(src, 0, rr->rr_col[c].rc_size, vdev_raidz_pq_func, &pqr); } - if (c == rm->rm_firstdatacol) { - for (i = ccnt; i < pcnt; i++) { + if (c == rr->rr_firstdatacol) { + for (uint64_t i = ccnt; i < pcnt; i++) { p[i] = 0; q[i] = 0; } @@ -733,7 +917,8 @@ * Treat short columns as though they are full of 0s. * Note that there's therefore nothing needed for P. */ - for (i = ccnt; i < pcnt; i++) { + uint64_t mask; + for (uint64_t i = ccnt; i < pcnt; i++) { VDEV_RAIDZ_64MUL_2(q[i], mask); } } @@ -741,38 +926,35 @@ } static void -vdev_raidz_generate_parity_pqr(raidz_map_t *rm) +vdev_raidz_generate_parity_pqr(raidz_row_t *rr) { - uint64_t *p, *q, *r, pcnt, ccnt, mask, i; - int c; - abd_t *src; - - pcnt = rm->rm_col[VDEV_RAIDZ_P].rc_size / sizeof (p[0]); - ASSERT(rm->rm_col[VDEV_RAIDZ_P].rc_size == - rm->rm_col[VDEV_RAIDZ_Q].rc_size); - ASSERT(rm->rm_col[VDEV_RAIDZ_P].rc_size == - rm->rm_col[VDEV_RAIDZ_R].rc_size); - - for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) { - src = rm->rm_col[c].rc_abd; - p = abd_to_buf(rm->rm_col[VDEV_RAIDZ_P].rc_abd); - q = abd_to_buf(rm->rm_col[VDEV_RAIDZ_Q].rc_abd); - r = abd_to_buf(rm->rm_col[VDEV_RAIDZ_R].rc_abd); - - ccnt = rm->rm_col[c].rc_size / sizeof (p[0]); - - if (c == rm->rm_firstdatacol) { - abd_copy_to_buf(p, src, rm->rm_col[c].rc_size); - (void) memcpy(q, p, rm->rm_col[c].rc_size); - (void) memcpy(r, p, rm->rm_col[c].rc_size); + uint64_t *p = abd_to_buf(rr->rr_col[VDEV_RAIDZ_P].rc_abd); + uint64_t *q = abd_to_buf(rr->rr_col[VDEV_RAIDZ_Q].rc_abd); + uint64_t *r = abd_to_buf(rr->rr_col[VDEV_RAIDZ_R].rc_abd); + uint64_t pcnt = rr->rr_col[VDEV_RAIDZ_P].rc_size / sizeof (p[0]); + ASSERT(rr->rr_col[VDEV_RAIDZ_P].rc_size == + rr->rr_col[VDEV_RAIDZ_Q].rc_size); + ASSERT(rr->rr_col[VDEV_RAIDZ_P].rc_size == + rr->rr_col[VDEV_RAIDZ_R].rc_size); + + for (int c = rr->rr_firstdatacol; c < rr->rr_cols; c++) { + abd_t *src = rr->rr_col[c].rc_abd; + + uint64_t ccnt = rr->rr_col[c].rc_size / sizeof (p[0]); + + if (c == rr->rr_firstdatacol) { + abd_copy_to_buf(p, src, rr->rr_col[c].rc_size); + (void) memcpy(q, p, rr->rr_col[c].rc_size); + (void) memcpy(r, p, rr->rr_col[c].rc_size); } else { struct pqr_struct pqr = { p, q, r }; - (void) abd_iterate_func(src, 0, rm->rm_col[c].rc_size, + (void) abd_iterate_func(src, 0, rr->rr_col[c].rc_size, vdev_raidz_pqr_func, &pqr); } - if (c == rm->rm_firstdatacol) { - for (i = ccnt; i < pcnt; i++) { + if (c == rr->rr_firstdatacol) { + for (uint64_t i = ccnt; i < pcnt; i++) { + /* XXX does this really happen? firstdatacol should be the same size as the parity cols */ p[i] = 0; q[i] = 0; r[i] = 0; @@ -782,7 +964,8 @@ * Treat short columns as though they are full of 0s. * Note that there's therefore nothing needed for P. 
*/ - for (i = ccnt; i < pcnt; i++) { + uint64_t mask; + for (uint64_t i = ccnt; i < pcnt; i++) { VDEV_RAIDZ_64MUL_2(q[i], mask); VDEV_RAIDZ_64MUL_4(r[i], mask); } @@ -795,17 +978,27 @@ * parity columns available. */ static void -vdev_raidz_generate_parity(raidz_map_t *rm) +vdev_raidz_generate_parity(raidz_row_t *rr) { - switch (rm->rm_firstdatacol) { + if (rr->rr_cols == 0) { + /* + * We are handling this block one row at a time (because + * this block has a different logical vs physical width, + * due to RAIDZ expansion), and this is a pad-only row, + * which has no parity. + */ + return; + } + + switch (rr->rr_firstdatacol) { case 1: - vdev_raidz_generate_parity_p(rm); + vdev_raidz_generate_parity_p(rr); break; case 2: - vdev_raidz_generate_parity_pq(rm); + vdev_raidz_generate_parity_pq(rr); break; case 3: - vdev_raidz_generate_parity_pqr(rm); + vdev_raidz_generate_parity_pqr(rr); break; default: cmn_err(CE_PANIC, "invalid RAID-Z configuration"); @@ -929,30 +1122,31 @@ } static int -vdev_raidz_reconstruct_p(raidz_map_t *rm, int *tgts, int ntgts) +vdev_raidz_reconstruct_p(raidz_row_t *rr, int *tgts, int ntgts) { int x = tgts[0]; - int c; abd_t *dst, *src; - ASSERT(ntgts == 1); - ASSERT(x >= rm->rm_firstdatacol); - ASSERT(x < rm->rm_cols); + zfs_dbgmsg("reconstruct_p(rm=%p x=%u)", + rr, x); + + ASSERT3U(ntgts, ==, 1); + ASSERT3U(x, >=, rr->rr_firstdatacol); + ASSERT3U(x, <, rr->rr_cols); - ASSERT(rm->rm_col[x].rc_size <= rm->rm_col[VDEV_RAIDZ_P].rc_size); - ASSERT(rm->rm_col[x].rc_size > 0); + ASSERT3U(rr->rr_col[x].rc_size, <=, rr->rr_col[VDEV_RAIDZ_P].rc_size); - src = rm->rm_col[VDEV_RAIDZ_P].rc_abd; - dst = rm->rm_col[x].rc_abd; + src = rr->rr_col[VDEV_RAIDZ_P].rc_abd; + dst = rr->rr_col[x].rc_abd; - abd_copy(dst, src, rm->rm_col[x].rc_size); + abd_copy(dst, src, rr->rr_col[x].rc_size); - for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) { - uint64_t size = MIN(rm->rm_col[x].rc_size, - rm->rm_col[c].rc_size); + for (int c = rr->rr_firstdatacol; c < rr->rr_cols; c++) { + uint64_t size = MIN(rr->rr_col[x].rc_size, + rr->rr_col[c].rc_size); - src = rm->rm_col[c].rc_abd; - dst = rm->rm_col[x].rc_abd; + src = rr->rr_col[c].rc_abd; + dst = rr->rr_col[x].rc_abd; /* XXX not needed, done above */ if (c == x) continue; @@ -965,51 +1159,54 @@ } static int -vdev_raidz_reconstruct_q(raidz_map_t *rm, int *tgts, int ntgts) +vdev_raidz_reconstruct_q(raidz_row_t *rr, int *tgts, int ntgts) { int x = tgts[0]; int c, exp; abd_t *dst, *src; + zfs_dbgmsg("reconstruct_q(rm=%p x=%u)", + rr, x); + ASSERT(ntgts == 1); - ASSERT(rm->rm_col[x].rc_size <= rm->rm_col[VDEV_RAIDZ_Q].rc_size); + ASSERT(rr->rr_col[x].rc_size <= rr->rr_col[VDEV_RAIDZ_Q].rc_size); - for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) { - uint64_t size = (c == x) ? 0 : MIN(rm->rm_col[x].rc_size, - rm->rm_col[c].rc_size); + for (c = rr->rr_firstdatacol; c < rr->rr_cols; c++) { + uint64_t size = (c == x) ? 
0 : MIN(rr->rr_col[x].rc_size, + rr->rr_col[c].rc_size); - src = rm->rm_col[c].rc_abd; - dst = rm->rm_col[x].rc_abd; + src = rr->rr_col[c].rc_abd; + dst = rr->rr_col[x].rc_abd; - if (c == rm->rm_firstdatacol) { + if (c == rr->rr_firstdatacol) { abd_copy(dst, src, size); - if (rm->rm_col[x].rc_size > size) + if (rr->rr_col[x].rc_size > size) abd_zero_off(dst, size, - rm->rm_col[x].rc_size - size); + rr->rr_col[x].rc_size - size); } else { - ASSERT3U(size, <=, rm->rm_col[x].rc_size); + ASSERT3U(size, <=, rr->rr_col[x].rc_size); (void) abd_iterate_func2(dst, src, 0, 0, size, vdev_raidz_reconst_q_pre_func, NULL); (void) abd_iterate_func(dst, - size, rm->rm_col[x].rc_size - size, + size, rr->rr_col[x].rc_size - size, vdev_raidz_reconst_q_pre_tail_func, NULL); } } - src = rm->rm_col[VDEV_RAIDZ_Q].rc_abd; - dst = rm->rm_col[x].rc_abd; - exp = 255 - (rm->rm_cols - 1 - x); + src = rr->rr_col[VDEV_RAIDZ_Q].rc_abd; + dst = rr->rr_col[x].rc_abd; + exp = 255 - (rr->rr_cols - 1 - x); struct reconst_q_struct rq = { abd_to_buf(src), exp }; - (void) abd_iterate_func(dst, 0, rm->rm_col[x].rc_size, + (void) abd_iterate_func(dst, 0, rr->rr_col[x].rc_size, vdev_raidz_reconst_q_post_func, &rq); return (1 << VDEV_RAIDZ_Q); } static int -vdev_raidz_reconstruct_pq(raidz_map_t *rm, int *tgts, int ntgts) +vdev_raidz_reconstruct_pq(raidz_row_t *rr, int *tgts, int ntgts) { uint8_t *p, *q, *pxy, *qxy, tmp, a, b, aexp, bexp; abd_t *pdata, *qdata; @@ -1018,12 +1215,15 @@ int y = tgts[1]; abd_t *xd, *yd; + zfs_dbgmsg("reconstruct_pq(rm=%p x=%u y=%u)", + rr, x, y); + ASSERT(ntgts == 2); ASSERT(x < y); - ASSERT(x >= rm->rm_firstdatacol); - ASSERT(y < rm->rm_cols); + ASSERT(x >= rr->rr_firstdatacol); + ASSERT(y < rr->rr_cols); - ASSERT(rm->rm_col[x].rc_size >= rm->rm_col[y].rc_size); + ASSERT(rr->rr_col[x].rc_size >= rr->rr_col[y].rc_size); /* * Move the parity data aside -- we're going to compute parity as @@ -1032,29 +1232,29 @@ * parity so we make those columns appear to be full of zeros by * setting their lengths to zero. 
*/ - pdata = rm->rm_col[VDEV_RAIDZ_P].rc_abd; - qdata = rm->rm_col[VDEV_RAIDZ_Q].rc_abd; - xsize = rm->rm_col[x].rc_size; - ysize = rm->rm_col[y].rc_size; + pdata = rr->rr_col[VDEV_RAIDZ_P].rc_abd; + qdata = rr->rr_col[VDEV_RAIDZ_Q].rc_abd; + xsize = rr->rr_col[x].rc_size; + ysize = rr->rr_col[y].rc_size; - rm->rm_col[VDEV_RAIDZ_P].rc_abd = - abd_alloc_linear(rm->rm_col[VDEV_RAIDZ_P].rc_size, B_TRUE); - rm->rm_col[VDEV_RAIDZ_Q].rc_abd = - abd_alloc_linear(rm->rm_col[VDEV_RAIDZ_Q].rc_size, B_TRUE); - rm->rm_col[x].rc_size = 0; - rm->rm_col[y].rc_size = 0; + rr->rr_col[VDEV_RAIDZ_P].rc_abd = + abd_alloc_linear(rr->rr_col[VDEV_RAIDZ_P].rc_size, B_TRUE); + rr->rr_col[VDEV_RAIDZ_Q].rc_abd = + abd_alloc_linear(rr->rr_col[VDEV_RAIDZ_Q].rc_size, B_TRUE); + rr->rr_col[x].rc_size = 0; + rr->rr_col[y].rc_size = 0; - vdev_raidz_generate_parity_pq(rm); + vdev_raidz_generate_parity_pq(rr); - rm->rm_col[x].rc_size = xsize; - rm->rm_col[y].rc_size = ysize; + rr->rr_col[x].rc_size = xsize; + rr->rr_col[y].rc_size = ysize; p = abd_to_buf(pdata); q = abd_to_buf(qdata); - pxy = abd_to_buf(rm->rm_col[VDEV_RAIDZ_P].rc_abd); - qxy = abd_to_buf(rm->rm_col[VDEV_RAIDZ_Q].rc_abd); - xd = rm->rm_col[x].rc_abd; - yd = rm->rm_col[y].rc_abd; + pxy = abd_to_buf(rr->rr_col[VDEV_RAIDZ_P].rc_abd); + qxy = abd_to_buf(rr->rr_col[VDEV_RAIDZ_Q].rc_abd); + xd = rr->rr_col[x].rc_abd; + yd = rr->rr_col[y].rc_abd; /* * We now have: @@ -1072,7 +1272,7 @@ */ a = vdev_raidz_pow2[255 + x - y]; - b = vdev_raidz_pow2[255 - (rm->rm_cols - 1 - x)]; + b = vdev_raidz_pow2[255 - (rr->rr_cols - 1 - x)]; tmp = 255 - vdev_raidz_log2[a ^ 1]; aexp = vdev_raidz_log2[vdev_raidz_exp2(a, tmp)]; @@ -1085,14 +1285,14 @@ (void) abd_iterate_func(xd, ysize, xsize - ysize, vdev_raidz_reconst_pq_tail_func, &rpq); - abd_free(rm->rm_col[VDEV_RAIDZ_P].rc_abd); - abd_free(rm->rm_col[VDEV_RAIDZ_Q].rc_abd); + abd_free(rr->rr_col[VDEV_RAIDZ_P].rc_abd); + abd_free(rr->rr_col[VDEV_RAIDZ_Q].rc_abd); /* * Restore the saved parity data. */ - rm->rm_col[VDEV_RAIDZ_P].rc_abd = pdata; - rm->rm_col[VDEV_RAIDZ_Q].rc_abd = qdata; + rr->rr_col[VDEV_RAIDZ_P].rc_abd = pdata; + rr->rr_col[VDEV_RAIDZ_Q].rc_abd = qdata; return ((1 << VDEV_RAIDZ_P) | (1 << VDEV_RAIDZ_Q)); } @@ -1249,13 +1449,13 @@ /* END CSTYLED */ static void -vdev_raidz_matrix_init(raidz_map_t *rm, int n, int nmap, int *map, +vdev_raidz_matrix_init(raidz_row_t *rr, int n, int nmap, int *map, uint8_t **rows) { int i, j; int pow; - ASSERT(n == rm->rm_cols - rm->rm_firstdatacol); + ASSERT(n == rr->rr_cols - rr->rr_firstdatacol); /* * Fill in the missing rows of interest. @@ -1279,7 +1479,7 @@ } static void -vdev_raidz_matrix_invert(raidz_map_t *rm, int n, int nmissing, int *missing, +vdev_raidz_matrix_invert(raidz_row_t *rr, int n, int nmissing, int *missing, uint8_t **rows, uint8_t **invrows, const uint8_t *used) { int i, j, ii, jj; @@ -1291,10 +1491,10 @@ * correspond to data columns. 
*/ for (i = 0; i < nmissing; i++) { - ASSERT3S(used[i], <, rm->rm_firstdatacol); + ASSERT3S(used[i], <, rr->rr_firstdatacol); } for (; i < n; i++) { - ASSERT3S(used[i], >=, rm->rm_firstdatacol); + ASSERT3S(used[i], >=, rr->rr_firstdatacol); } /* @@ -1311,8 +1511,8 @@ */ for (i = 0; i < nmissing; i++) { for (j = nmissing; j < n; j++) { - ASSERT3U(used[j], >=, rm->rm_firstdatacol); - jj = used[j] - rm->rm_firstdatacol; + ASSERT3U(used[j], >=, rr->rr_firstdatacol); + jj = used[j] - rr->rr_firstdatacol; ASSERT3S(jj, <, n); invrows[i][j] = rows[i][jj]; rows[i][jj] = 0; @@ -1373,7 +1573,7 @@ } static void -vdev_raidz_matrix_reconstruct(raidz_map_t *rm, int n, int nmissing, +vdev_raidz_matrix_reconstruct(raidz_row_t *rr, int n, int nmissing, int *missing, uint8_t **invrows, const uint8_t *used) { int i, j, x, cc, c; @@ -1405,22 +1605,24 @@ for (i = 0; i < n; i++) { c = used[i]; - ASSERT3U(c, <, rm->rm_cols); + ASSERT3U(c, <, rr->rr_cols); - src = abd_to_buf(rm->rm_col[c].rc_abd); - ccount = rm->rm_col[c].rc_size; + ccount = rr->rr_col[c].rc_size; + ASSERT(ccount >= rr->rr_col[missing[0]].rc_size || i > 0); + if (ccount == 0) + continue; + src = abd_to_buf(rr->rr_col[c].rc_abd); for (j = 0; j < nmissing; j++) { - cc = missing[j] + rm->rm_firstdatacol; - ASSERT3U(cc, >=, rm->rm_firstdatacol); - ASSERT3U(cc, <, rm->rm_cols); + cc = missing[j] + rr->rr_firstdatacol; + ASSERT3U(cc, >=, rr->rr_firstdatacol); + ASSERT3U(cc, <, rr->rr_cols); ASSERT3U(cc, !=, c); - dst[j] = abd_to_buf(rm->rm_col[cc].rc_abd); - dcount[j] = rm->rm_col[cc].rc_size; + dcount[j] = rr->rr_col[cc].rc_size; + if (dcount[j] != 0) + dst[j] = abd_to_buf(rr->rr_col[cc].rc_abd); } - ASSERT(ccount >= rm->rm_col[missing[0]].rc_size || i > 0); - for (x = 0; x < ccount; x++, src++) { if (*src != 0) log = vdev_raidz_log2[*src]; @@ -1449,13 +1651,15 @@ } static int -vdev_raidz_reconstruct_general(raidz_map_t *rm, int *tgts, int ntgts) +vdev_raidz_reconstruct_general(raidz_row_t *rr, int *tgts, int ntgts) { int n, i, c, t, tt; int nmissing_rows; int missing_rows[VDEV_RAIDZ_MAXPARITY]; int parity_map[VDEV_RAIDZ_MAXPARITY]; + zfs_dbgmsg("reconstruct_general(rm=%p ntgts=%u)", + rr, ntgts); uint8_t *p, *pp; size_t psize; @@ -1471,28 +1675,31 @@ * Matrix reconstruction can't use scatter ABDs yet, so we allocate * temporary linear ABDs. */ - if (!abd_is_linear(rm->rm_col[rm->rm_firstdatacol].rc_abd)) { - bufs = kmem_alloc(rm->rm_cols * sizeof (abd_t *), KM_PUSHPAGE); + if (!abd_is_linear(rr->rr_col[rr->rr_firstdatacol].rc_abd)) { + bufs = kmem_alloc(rr->rr_cols * sizeof (abd_t *), KM_PUSHPAGE); - for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) { - raidz_col_t *col = &rm->rm_col[c]; + for (c = rr->rr_firstdatacol; c < rr->rr_cols; c++) { + raidz_col_t *col = &rr->rr_col[c]; bufs[c] = col->rc_abd; - col->rc_abd = abd_alloc_linear(col->rc_size, B_TRUE); - abd_copy(col->rc_abd, bufs[c], col->rc_size); + if (bufs[c] != NULL) { + col->rc_abd = + abd_alloc_linear(col->rc_size, B_TRUE); + abd_copy(col->rc_abd, bufs[c], col->rc_size); + } } } - n = rm->rm_cols - rm->rm_firstdatacol; + n = rr->rr_cols - rr->rr_firstdatacol; /* * Figure out which data columns are missing. 
*/ nmissing_rows = 0; for (t = 0; t < ntgts; t++) { - if (tgts[t] >= rm->rm_firstdatacol) { + if (tgts[t] >= rr->rr_firstdatacol) { missing_rows[nmissing_rows++] = - tgts[t] - rm->rm_firstdatacol; + tgts[t] - rr->rr_firstdatacol; } } @@ -1502,7 +1709,7 @@ */ for (tt = 0, c = 0, i = 0; i < nmissing_rows; c++) { ASSERT(tt < ntgts); - ASSERT(c < rm->rm_firstdatacol); + ASSERT(c < rr->rr_firstdatacol); /* * Skip any targeted parity columns. @@ -1537,9 +1744,9 @@ used[i] = parity_map[i]; } - for (tt = 0, c = rm->rm_firstdatacol; c < rm->rm_cols; c++) { + for (tt = 0, c = rr->rr_firstdatacol; c < rr->rr_cols; c++) { if (tt < nmissing_rows && - c == missing_rows[tt] + rm->rm_firstdatacol) { + c == missing_rows[tt] + rr->rr_firstdatacol) { tt++; continue; } @@ -1552,18 +1759,18 @@ /* * Initialize the interesting rows of the matrix. */ - vdev_raidz_matrix_init(rm, n, nmissing_rows, parity_map, rows); + vdev_raidz_matrix_init(rr, n, nmissing_rows, parity_map, rows); /* * Invert the matrix. */ - vdev_raidz_matrix_invert(rm, n, nmissing_rows, missing_rows, rows, + vdev_raidz_matrix_invert(rr, n, nmissing_rows, missing_rows, rows, invrows, used); /* * Reconstruct the missing data using the generated matrix. */ - vdev_raidz_matrix_reconstruct(rm, n, nmissing_rows, missing_rows, + vdev_raidz_matrix_reconstruct(rr, n, nmissing_rows, missing_rows, invrows, used); kmem_free(p, psize); @@ -1572,21 +1779,23 @@ * copy back from temporary linear abds and free them */ if (bufs) { - for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) { - raidz_col_t *col = &rm->rm_col[c]; + for (c = rr->rr_firstdatacol; c < rr->rr_cols; c++) { + raidz_col_t *col = &rr->rr_col[c]; - abd_copy(bufs[c], col->rc_abd, col->rc_size); - abd_free(col->rc_abd); + if (bufs[c] != NULL) { + abd_copy(bufs[c], col->rc_abd, col->rc_size); + abd_free(col->rc_abd); + } col->rc_abd = bufs[c]; } - kmem_free(bufs, rm->rm_cols * sizeof (abd_t *)); + kmem_free(bufs, rr->rr_cols * sizeof (abd_t *)); } return (code); } static int -vdev_raidz_reconstruct(raidz_map_t *rm, int *t, int nt) +vdev_raidz_reconstruct_row(raidz_row_t *rr, int *t, int nt) { int tgts[VDEV_RAIDZ_MAXPARITY], *dt; int ntgts; @@ -1595,26 +1804,37 @@ int nbadparity, nbaddata; int parity_valid[VDEV_RAIDZ_MAXPARITY]; + zfs_dbgmsg("reconstruct(rm=%p nt=%u cols=%u md=%u mp=%u)", + rr, nt, (int)rr->rr_cols, (int)rr->rr_missingdata, (int)rr->rr_missingparity); + /* * The tgts list must already be sorted. 
*/ + zfs_dbgmsg("reconstruct(rm=%p t[%u]=%u)", rr, 0, t[0]); for (i = 1; i < nt; i++) { + zfs_dbgmsg("reconstruct(rm=%p t[%u]=%u)", + rr, i, t[i]); ASSERT(t[i] > t[i - 1]); } - nbadparity = rm->rm_firstdatacol; - nbaddata = rm->rm_cols - nbadparity; + nbadparity = rr->rr_firstdatacol; + nbaddata = rr->rr_cols - nbadparity; ntgts = 0; - for (i = 0, c = 0; c < rm->rm_cols; c++) { - if (c < rm->rm_firstdatacol) + for (i = 0, c = 0; c < rr->rr_cols; c++) { + zfs_dbgmsg("reconstruct(rm=%p col=%u devid=%u offset=%llx error=%u)", + rr, c, + (int)rr->rr_col[c].rc_devidx, + (long long)rr->rr_col[c].rc_offset, + (int)rr->rr_col[c].rc_error); + if (c < rr->rr_firstdatacol) parity_valid[c] = B_FALSE; if (i < nt && c == t[i]) { tgts[ntgts++] = c; i++; - } else if (rm->rm_col[c].rc_error != 0) { + } else if (rr->rr_col[c].rc_error != 0) { tgts[ntgts++] = c; - } else if (c >= rm->rm_firstdatacol) { + } else if (c >= rr->rr_firstdatacol) { nbaddata--; } else { parity_valid[c] = B_TRUE; @@ -1635,30 +1855,30 @@ switch (nbaddata) { case 1: if (parity_valid[VDEV_RAIDZ_P]) - return (vdev_raidz_reconstruct_p(rm, dt, 1)); + return (vdev_raidz_reconstruct_p(rr, dt, 1)); - ASSERT(rm->rm_firstdatacol > 1); + ASSERT(rr->rr_firstdatacol > 1); if (parity_valid[VDEV_RAIDZ_Q]) - return (vdev_raidz_reconstruct_q(rm, dt, 1)); + return (vdev_raidz_reconstruct_q(rr, dt, 1)); - ASSERT(rm->rm_firstdatacol > 2); + ASSERT(rr->rr_firstdatacol > 2); break; case 2: - ASSERT(rm->rm_firstdatacol > 1); + ASSERT(rr->rr_firstdatacol > 1); if (parity_valid[VDEV_RAIDZ_P] && parity_valid[VDEV_RAIDZ_Q]) - return (vdev_raidz_reconstruct_pq(rm, dt, 2)); + return (vdev_raidz_reconstruct_pq(rr, dt, 2)); - ASSERT(rm->rm_firstdatacol > 2); + ASSERT(rr->rr_firstdatacol > 2); break; } } - code = vdev_raidz_reconstruct_general(rm, tgts, ntgts); + code = vdev_raidz_reconstruct_general(rr, tgts, ntgts); ASSERT(code < (1 << VDEV_RAIDZ_MAXPARITY)); ASSERT(code > 0); return (code); @@ -1668,8 +1888,8 @@ vdev_raidz_open(vdev_t *vd, uint64_t *asize, uint64_t *max_asize, uint64_t *logical_ashift, uint64_t *physical_ashift) { - vdev_t *cvd; - uint64_t nparity = vd->vdev_nparity; + vdev_raidz_t *vdrz = vd->vdev_tsd; + uint64_t nparity = vdrz->vd_nparity; int c; int lasterror = 0; int numerrors = 0; @@ -1685,7 +1905,7 @@ vdev_open_children(vd); for (c = 0; c < vd->vdev_children; c++) { - cvd = vd->vdev_child[c]; + vdev_t *cvd = vd->vdev_child[c]; if (cvd->vdev_open_error != 0) { lasterror = cvd->vdev_open_error; @@ -1786,9 +2006,10 @@ vdev_raidz_physio(vdev_t *vd, caddr_t data, size_t size, uint64_t offset, uint64_t origoffset, boolean_t doread, boolean_t isdump) { + vdev_raidz_t *vdrz = vd->vdev_tsd; vdev_t *tvd = vd->vdev_top; vdev_t *cvd; - raidz_map_t *rm; + raidz_row_t *rr; raidz_col_t *rc; int c, err = 0; @@ -1818,15 +2039,19 @@ */ abd_t *abd = abd_get_from_buf(data - (offset - origoffset), SPA_OLD_MAXBLOCKSIZE); - rm = vdev_raidz_map_alloc(abd, + /* + * XXX deal with dump to expanded raidz + */ + raidz_map_t *rm = vdev_raidz_map_alloc(abd, SPA_OLD_MAXBLOCKSIZE, origoffset, B_FALSE, tvd->vdev_ashift, - vd->vdev_children, vd->vdev_nparity); + vd->vdev_children, vdrz->vd_nparity); + rr = rm->rm_row[0]; coloffset = origoffset; - for (c = rm->rm_firstdatacol; c < rm->rm_cols; + for (c = rr->rr_firstdatacol; c < rr->rr_cols; c++, coloffset += rc->rc_size) { - rc = &rm->rm_col[c]; + rc = &rr->rr_col[c]; cvd = vd->vdev_child[rc->rc_devidx]; /* @@ -1863,7 +2088,7 @@ break; } - vdev_raidz_map_free(rm); + vdev_raidz_row_free(rr); abd_put(abd); #endif /* KERNEL */ 
@@ -1874,10 +2099,11 @@ static uint64_t vdev_raidz_asize(vdev_t *vd, uint64_t psize) { + vdev_raidz_t *vdrz = vd->vdev_tsd; uint64_t asize; uint64_t ashift = vd->vdev_top->vdev_ashift; - uint64_t cols = vd->vdev_children; - uint64_t nparity = vd->vdev_nparity; + uint64_t cols = vdrz->vd_logical_width; + uint64_t nparity = vdrz->vd_nparity; asize = ((psize - 1) >> ashift) + 1; asize += nparity * ((asize + cols - nparity - 1) / (cols - nparity)); @@ -1896,119 +2122,137 @@ rc->rc_skipped = 0; } -/* - * Start an IO operation on a RAIDZ VDev - * - * Outline: - * - For write operations: - * 1. Generate the parity data - * 2. Create child zio write operations to each column's vdev, for both - * data and parity. - * 3. If the column skips any sectors for padding, create optional dummy - * write zio children for those areas to improve aggregation continuity. - * - For read operations: - * 1. Create child zio read operations to each data column's vdev to read - * the range of data required for zio. - * 2. If this is a scrub or resilver operation, or if any of the data - * vdevs have had errors, then create zio read operations to the parity - * columns' VDevs as well. - */ static void -vdev_raidz_io_start(zio_t *zio) +vdev_raidz_io_verify(zio_t *zio, raidz_row_t *rr, int col) { +#if 0 +#ifdef ZFS_DEBUG vdev_t *vd = zio->io_vd; vdev_t *tvd = vd->vdev_top; - vdev_t *cvd; - raidz_map_t *rm; - raidz_col_t *rc; - int c, i; - rm = vdev_raidz_map_alloc(zio->io_abd, zio->io_size, zio->io_offset, - zio->io_type == ZIO_TYPE_FREE, - tvd->vdev_ashift, vd->vdev_children, - vd->vdev_nparity); + range_seg_t logical_rs, physical_rs; + logical_rs.rs_start = zio->io_offset; + logical_rs.rs_end = logical_rs.rs_start + + vdev_raidz_asize(zio->io_vd, zio->io_size); - zio->io_vsd = rm; - zio->io_vsd_ops = &vdev_raidz_vsd_ops; + raidz_col_t *rc = &rr->rr_col[col]; + vdev_t *cvd = vd->vdev_child[rc->rc_devidx]; - ASSERT3U(rm->rm_asize, ==, vdev_psize_to_asize(vd, zio->io_size)); + vdev_xlate(cvd, &logical_rs, &physical_rs); + ASSERT3U(rc->rc_offset, ==, physical_rs.rs_start); + ASSERT3U(rc->rc_offset, <, physical_rs.rs_end); + /* + * It would be nice to assert that rs_end is equal + * to rc_offset + rc_size but there might be an + * optional I/O at the end that is not accounted in + * rc_size. 
+ */ + if (physical_rs.rs_end > rc->rc_offset + rc->rc_size) { + ASSERT3U(physical_rs.rs_end, ==, rc->rc_offset + + rc->rc_size + (1 << tvd->vdev_ashift)); + } else { + ASSERT3U(physical_rs.rs_end, ==, rc->rc_offset + rc->rc_size); + } +#endif +#endif +} - if (zio->io_type == ZIO_TYPE_FREE) { - for (c = 0; c < rm->rm_cols; c++) { - rc = &rm->rm_col[c]; - cvd = vd->vdev_child[rc->rc_devidx]; - zio_nowait(zio_vdev_child_io(zio, NULL, cvd, - rc->rc_offset, rc->rc_abd, rc->rc_size, - zio->io_type, zio->io_priority, 0, - vdev_raidz_child_done, rc)); - } +static void +vdev_raidz_io_start_free(zio_t *zio, raidz_row_t *rr) +{ + vdev_t *vd = zio->io_vd; + vdev_t *tvd = vd->vdev_top; - zio_execute(zio); - return; + for (int c = 0; c < rr->rr_cols; c++) { + raidz_col_t *rc = &rr->rr_col[c]; + if (rc->rc_size == 0) + continue; + vdev_t *cvd = vd->vdev_child[rc->rc_devidx]; + + zio_nowait(zio_vdev_child_io(zio, NULL, cvd, + rc->rc_offset, rc->rc_abd, rc->rc_size, + zio->io_type, zio->io_priority, 0, + vdev_raidz_child_done, rc)); } +} - if (zio->io_type == ZIO_TYPE_WRITE) { - vdev_raidz_generate_parity(rm); +static void +vdev_raidz_io_start_write(zio_t *zio, raidz_row_t *rr) +{ + vdev_t *vd = zio->io_vd; + vdev_t *tvd = vd->vdev_top; - for (c = 0; c < rm->rm_cols; c++) { - rc = &rm->rm_col[c]; - cvd = vd->vdev_child[rc->rc_devidx]; - zio_nowait(zio_vdev_child_io(zio, NULL, cvd, - rc->rc_offset, rc->rc_abd, rc->rc_size, - zio->io_type, zio->io_priority, 0, - vdev_raidz_child_done, rc)); - } + vdev_raidz_generate_parity(rr); - /* - * Generate optional I/Os for any skipped sectors to improve - * aggregation contiguity. - */ - for (c = rm->rm_skipstart, i = 0; i < rm->rm_nskip; c++, i++) { - ASSERT(c <= rm->rm_scols); - if (c == rm->rm_scols) - c = 0; - rc = &rm->rm_col[c]; - cvd = vd->vdev_child[rc->rc_devidx]; - zio_nowait(zio_vdev_child_io(zio, NULL, cvd, - rc->rc_offset + rc->rc_size, NULL, - 1 << tvd->vdev_ashift, - zio->io_type, zio->io_priority, - ZIO_FLAG_NODATA | ZIO_FLAG_OPTIONAL, NULL, NULL)); - } + for (int c = 0; c < rr->rr_cols; c++) { + raidz_col_t *rc = &rr->rr_col[c]; + if (rc->rc_size == 0) + continue; + vdev_t *cvd = vd->vdev_child[rc->rc_devidx]; - zio_execute(zio); - return; + zio_nowait(zio_vdev_child_io(zio, NULL, cvd, + rc->rc_offset, rc->rc_abd, rc->rc_size, + zio->io_type, zio->io_priority, 0, + vdev_raidz_child_done, rc)); + } + + /* XXX do this in vdev_raidz_io_start, based on nskip stored in rm + */ +#if 0 + /* + * Generate optional I/Os for any skipped sectors to improve + * aggregation contiguity. + */ + for (int c = rr->rm_skipstart, i = 0; i < rr->rm_nskip; c++, i++) { + ASSERT(c <= rr->rm_scols); + if (c == rr->rm_scols) + c = 0; + raidz_col_t *rc = &rr->rr_col[c]; + vdev_t *cvd = vd->vdev_child[rc->rc_devidx]; + zio_nowait(zio_vdev_child_io(zio, NULL, cvd, + rc->rc_offset + rc->rc_size, NULL, + 1 << tvd->vdev_ashift, + zio->io_type, zio->io_priority, + ZIO_FLAG_NODATA | ZIO_FLAG_OPTIONAL, NULL, NULL)); } +#endif +} - ASSERT(zio->io_type == ZIO_TYPE_READ); +static void +vdev_raidz_io_start_read(zio_t *zio, raidz_row_t *rr, boolean_t forceparity) +{ + vdev_t *vd = zio->io_vd; /* * Iterate over the columns in reverse order so that we hit the parity * last -- any errors along the way will force us to read the parity. 
*/ - for (c = rm->rm_cols - 1; c >= 0; c--) { - rc = &rm->rm_col[c]; - cvd = vd->vdev_child[rc->rc_devidx]; + for (int c = rr->rr_cols - 1; c >= 0; c--) { + raidz_col_t *rc = &rr->rr_col[c]; + if (rc->rc_size == 0) + continue; + vdev_t *cvd = vd->vdev_child[rc->rc_devidx]; if (!vdev_readable(cvd)) { - if (c >= rm->rm_firstdatacol) - rm->rm_missingdata++; + if (c >= rr->rr_firstdatacol) + rr->rr_missingdata++; else - rm->rm_missingparity++; + rr->rr_missingparity++; rc->rc_error = SET_ERROR(ENXIO); rc->rc_tried = 1; /* don't even try */ rc->rc_skipped = 1; continue; } if (vdev_dtl_contains(cvd, DTL_MISSING, zio->io_txg, 1)) { - if (c >= rm->rm_firstdatacol) - rm->rm_missingdata++; + if (c >= rr->rr_firstdatacol) + rr->rr_missingdata++; else - rm->rm_missingparity++; + rr->rr_missingparity++; rc->rc_error = SET_ERROR(ESTALE); rc->rc_skipped = 1; continue; } - if (c >= rm->rm_firstdatacol || rm->rm_missingdata > 0 || + if (forceparity || + c >= rr->rr_firstdatacol || rr->rr_missingdata > 0 || (zio->io_flags & (ZIO_FLAG_SCRUB | ZIO_FLAG_RESILVER))) { zio_nowait(zio_vdev_child_io(zio, NULL, cvd, rc->rc_offset, rc->rc_abd, rc->rc_size, @@ -2016,6 +2260,75 @@ vdev_raidz_child_done, rc)); } } +} + +/* + * Start an IO operation on a RAIDZ VDev + * + * Outline: + * - For write operations: + * 1. Generate the parity data + * 2. Create child zio write operations to each column's vdev, for both + * data and parity. + * 3. If the column skips any sectors for padding, create optional dummy + * write zio children for those areas to improve aggregation continuity. + * - For read operations: + * 1. Create child zio read operations to each data column's vdev to read + * the range of data required for zio. + * 2. If this is a scrub or resilver operation, or if any of the data + * vdevs have had errors, then create zio read operations to the parity + * columns' VDevs as well. + */ +static void +vdev_raidz_io_start(zio_t *zio) +{ + vdev_t *vd = zio->io_vd; + vdev_t *tvd = vd->vdev_top; + vdev_raidz_t *vdrz = vd->vdev_tsd; + raidz_map_t *rm; + + ASSERT(!vdrz->vn_expanding); + + if (vdrz->vd_logical_width != vdrz->vd_physical_width) { + rm = vdev_raidz_map_alloc_expanded(zio->io_abd, + zio->io_size, zio->io_offset, + zio->io_type == ZIO_TYPE_FREE, + tvd->vdev_ashift, vdrz->vd_physical_width, + vdrz->vd_logical_width, vdrz->vd_nparity); + } else { + rm = vdev_raidz_map_alloc(zio->io_abd, + zio->io_size, zio->io_offset, + zio->io_type == ZIO_TYPE_FREE, + tvd->vdev_ashift, vdrz->vd_logical_width, + vdrz->vd_nparity); + } + + zio->io_vsd = rm; + zio->io_vsd_ops = &vdev_raidz_vsd_ops; + + if (zio->io_type == ZIO_TYPE_FREE) { + for (int i = 0; i < rm->rm_nrows; i++) { + vdev_raidz_io_start_free(zio, rm->rm_row[i]); + } + } else if (zio->io_type == ZIO_TYPE_WRITE) { + for (int i = 0; i < rm->rm_nrows; i++) { + vdev_raidz_io_start_write(zio, + rm->rm_row[i]); + } + } else { + ASSERT(zio->io_type == ZIO_TYPE_READ); + /* + * If there are multiple rows, we will be hitting + * all disks, so go ahead and read the parity so + * that we are reading in decent size chunks. + * XXX maybe doesn't really matter? + */ + boolean_t forceparity = rm->rm_nrows > 1; + for (int i = 0; i < rm->rm_nrows; i++) { + vdev_raidz_io_start_read(zio, + rm->rm_row[i], forceparity); + } + } zio_execute(zio); } @@ -2070,10 +2383,10 @@ * Generate the parity from the data columns. If we tried and were able to * read the parity without error, verify that the generated parity matches the * data we read. If it doesn't, we fire off a checksum error. 
Return the - * number such failures. + * number of such failures. */ static int -raidz_parity_verify(zio_t *zio, raidz_map_t *rm) +raidz_parity_verify(zio_t *zio, raidz_row_t *rr) { void *orig[VDEV_RAIDZ_MAXPARITY]; int c, ret = 0; @@ -2086,21 +2399,29 @@ if (checksum == ZIO_CHECKSUM_NOPARITY) return (ret); - for (c = 0; c < rm->rm_firstdatacol; c++) { - rc = &rm->rm_col[c]; + for (c = 0; c < rr->rr_firstdatacol; c++) { + rc = &rr->rr_col[c]; if (!rc->rc_tried || rc->rc_error != 0) continue; orig[c] = zio_buf_alloc(rc->rc_size); abd_copy_to_buf(orig[c], rc->rc_abd, rc->rc_size); } - vdev_raidz_generate_parity(rm); + /* XXX regenerates parity even for !tried||rc_error!=0 + * This could cause a side effect of fixing stuff we didn't realize + * was necessary (i.e. even if we return 0) + */ + vdev_raidz_generate_parity(rr); + + for (c = 0; c < rr->rr_firstdatacol; c++) { + rc = &rr->rr_col[c]; - for (c = 0; c < rm->rm_firstdatacol; c++) { - rc = &rm->rm_col[c]; if (!rc->rc_tried || rc->rc_error != 0) continue; + if (abd_cmp_buf(rc->rc_abd, orig[c], rc->rc_size) != 0) { + zfs_dbgmsg("raidz_parity_verify found error on col=%u devidx=%u", + c, (int)rc->rc_devidx); raidz_checksum_error(zio, rc, orig[c]); rc->rc_error = SET_ERROR(ECKSUM); ret++; @@ -2117,16 +2438,83 @@ static uint64_t raidz_corrected[1 << VDEV_RAIDZ_MAXPARITY]; static int -vdev_raidz_worst_error(raidz_map_t *rm) +vdev_raidz_worst_error(raidz_row_t *rr) { int error = 0; - for (int c = 0; c < rm->rm_cols; c++) - error = zio_worst_error(error, rm->rm_col[c].rc_error); + for (int c = 0; c < rr->rr_cols; c++) + error = zio_worst_error(error, rr->rr_col[c].rc_error); return (error); } +static void +vdev_raidz_io_done_verified(zio_t *zio, raidz_row_t *rr) +{ + int unexpected_errors = 0; + int parity_errors = 0; + int parity_untried = 0; + int data_errors = 0; + + ASSERT3U(zio->io_type, ==, ZIO_TYPE_READ); + + for (int c = 0; c < rr->rr_cols; c++) { + raidz_col_t *rc = &rr->rr_col[c]; + + if (rc->rc_error) { + if (c < rr->rr_firstdatacol) + parity_errors++; + else + data_errors++; + + if (!rc->rc_skipped) + unexpected_errors++; + } else if (c < rr->rr_firstdatacol && !rc->rc_tried) { + parity_untried++; + } + } + + /* + * If we read more parity disks than were used for + * reconstruction, confirm that the other parity disks produced + * correct data. + * + * Note that we also regenerate parity when resilvering so we + * can write it out to failed devices later. + */ + zfs_dbgmsg("parity_errors=%u parity_untried=%u data_errors=%u verifying=%s", + parity_errors, parity_untried, data_errors, + (parity_errors + parity_untried < rr->rr_firstdatacol - data_errors) ? "yes" : "no"); + if (parity_errors + parity_untried < + rr->rr_firstdatacol - data_errors || + (zio->io_flags & ZIO_FLAG_RESILVER)) { + int n = raidz_parity_verify(zio, rr); + unexpected_errors += n; + ASSERT3U(parity_errors + n, <=, rr->rr_firstdatacol); + } + + if (zio->io_error == 0 && spa_writeable(zio->io_spa) && + (unexpected_errors > 0 || (zio->io_flags & ZIO_FLAG_RESILVER))) { + /* + * Use the good data we have in hand to repair damaged children. + */ + for (int c = 0; c < rr->rr_cols; c++) { + raidz_col_t *rc = &rr->rr_col[c]; + vdev_t *vd = zio->io_vd; + vdev_t *cvd = vd->vdev_child[rc->rc_devidx]; + + if (rc->rc_error == 0 || rc->rc_size == 0) + continue; + + zio_nowait(zio_vdev_child_io(zio, NULL, cvd, + rc->rc_offset, rc->rc_abd, rc->rc_size, + ZIO_TYPE_WRITE, ZIO_PRIORITY_ASYNC_WRITE, + ZIO_FLAG_IO_REPAIR | (unexpected_errors ? 
+ ZIO_FLAG_SELF_HEAL : 0), NULL, NULL)); + } + } +} + /* * Iterate over all combinations of bad data and attempt a reconstruction. * Note that the algorithm below is non-optimal because it doesn't take into @@ -2134,454 +2522,771 @@ * triple-parity RAID-Z the reconstruction procedure is the same if column 4 * is targeted as invalid as if columns 1 and 4 are targeted since in both * cases we'd only use parity information in column 0. + * + * The order that we find the various possible combinations of failed + * disks is dictated by these rules: + * - Examine each "slot" (the "i" in tgts[i]) + * - Try to increment this slot (tgts[i] = tgts[i] + 1) + * - if we can't increment because it runs into the next slot, + * reset our slot to the minimum, and examine the next slot + * For example, with a 6-wide RAIDZ3, and no known errors (so we have to choose + * 3 columns to reconstruct), we will generate the following sequence: + * + * STATE ACTION + * 0 1 2 special case: skip since these are all parity + * 0 1 3 first slot: reset to 0; middle slot: increment to 2 + * 0 2 3 first slot: increment to 1 + * 1 2 3 first: reset to 0; middle: reset to 1; last: increment to 4 + * 0 1 4 first: reset to 0; middle: increment to 2 + * 0 2 4 first: increment to 1 + * 1 2 4 first: reset to 0; middle: increment to 3 + * 0 3 4 first: increment to 1 + * 1 3 4 first: increment to 2 + * 2 3 4 first: reset to 0; middle: reset to 1; last: increment to 5 + * 0 1 5 first: reset to 0; middle: increment to 2 + * 0 2 5 first: increment to 1 + * 1 2 5 first: reset to 0; middle: increment to 3 + * 0 3 5 first: increment to 1 + * 1 3 5 first: increment to 2 + * 2 3 5 first: reset to 0; middle: increment to 4 + * 0 4 5 first: increment to 1 + * 1 4 5 first: increment to 2 + * 2 4 5 first: increment to 3 + * 3 4 5 done */ -static int -vdev_raidz_combrec(zio_t *zio, int total_errors, int data_errors) -{ - raidz_map_t *rm = zio->io_vsd; - raidz_col_t *rc; - void *orig[VDEV_RAIDZ_MAXPARITY]; - int tstore[VDEV_RAIDZ_MAXPARITY + 2]; - int *tgts = &tstore[1]; - int current, next, i, c, n; - int code, ret = 0; - ASSERT(total_errors < rm->rm_firstdatacol); +/* + * Should this sector be considered failed for logical child ID i? + * XXX comment explaining logical child ID's + */ +static boolean_t +raidz_simulate_failure(vdev_raidz_t *vdrz, int ashift, int i, raidz_col_t *rc) +{ + uint64_t sector_id = + vdrz->vd_physical_width * (rc->rc_offset >> ashift) + + rc->rc_devidx; + +#if 0 + zfs_dbgmsg("raidz_simulate_failure(pw=%u lw=%u ashift=%u i=%u rc_offset=%llx rc_devidx=%u sector_id=%u", + vdrz->vd_physical_width, + vdrz->vd_logical_width, + ashift, + i, + (long long)rc->rc_offset, + (int)rc->rc_devidx, + (long long)sector_id); +#endif - /* - * This simplifies one edge condition. - */ - tgts[-1] = -1; + for (int w = vdrz->vd_physical_width; + w >= vdrz->vd_logical_width; w--) { + if (i < w) { + return (sector_id % w == i); + } else { + i -= w; + } + } + ASSERT(!"invalid logical child id"); + return (B_FALSE); +} - for (n = 1; n <= rm->rm_firstdatacol - total_errors; n++) { - /* - * Initialize the targets array by finding the first n columns - * that contain no error. - * - * If there were no data errors, we need to ensure that we're - * always explicitly attempting to reconstruct at least one - * data column. To do this, we simply push the highest target - * up into the data columns. 
- */ - for (c = 0, i = 0; i < n; i++) { - if (i == n - 1 && data_errors == 0 && - c < rm->rm_firstdatacol) { - c = rm->rm_firstdatacol; +static void +raidz_restore_orig_data(raidz_map_t *rm) +{ + for (int i = 0; i < rm->rm_nrows; i++) { + raidz_row_t *rr = rm->rm_row[i]; + for (int c = 0; c < rr->rr_cols; c++) { + raidz_col_t *rc = &rr->rr_col[c]; + if (rc->rc_need_orig_restore) { + abd_copy_from_buf(rc->rc_abd, + rc->rc_orig_data, rc->rc_size); + rc->rc_need_orig_restore = B_FALSE; } + } + } +} - while (rm->rm_col[c].rc_error != 0) { - c++; - ASSERT3S(c, <, rm->rm_cols); +/* + * returns EINVAL if reconstruction of the block will not be possible + * returns ECKSUM if this specific reconstruction failed + * returns 0 on successful reconstruction + */ +static int +raidz_reconstruct(zio_t *zio, int *ltgts, int ntgts) +{ + raidz_map_t *rm = zio->io_vsd; + vdev_raidz_t *vdrz = zio->io_vd->vdev_tsd; + + zfs_dbgmsg("raidz_reconstruct_expanded(zio=%p ltgts=%u,%u,%u ntgts=%u", + zio, ltgts[0], ltgts[1], ltgts[2], ntgts); + + /* Reconstruct each row */ + for (int r = 0; r < rm->rm_nrows; r++) { + raidz_row_t *rr = rm->rm_row[r]; + int my_tgts[VDEV_RAIDZ_MAXPARITY]; /* value is child id */ + int t = 0; + int dead = 0; + int dead_data = 0; + + zfs_dbgmsg("raidz_reconstruct_expanded(row=%u)", + r); + + for (int c = 0; c < rr->rr_cols; c++) { + raidz_col_t *rc = &rr->rr_col[c]; + ASSERT0(rc->rc_need_orig_restore); + if (rc->rc_error != 0) { + dead++; + if (c >= vdrz->vd_nparity) + dead_data++; + continue; + } + if (rc->rc_size == 0) + continue; + for (int lt = 0; lt < ntgts; lt++) { + if (raidz_simulate_failure(vdrz, + zio->io_vd->vdev_top->vdev_ashift, + ltgts[lt], rc)) { + if (rc->rc_orig_data == NULL) { + rc->rc_orig_data = + zio_buf_alloc(rc->rc_size); + abd_copy_to_buf(rc->rc_orig_data, + rc->rc_abd, rc->rc_size); + } + rc->rc_need_orig_restore = B_TRUE; + + dead++; + if (c >= vdrz->vd_nparity) + dead_data++; + my_tgts[t++] = c; + zfs_dbgmsg("simulating failure of col %u devidx %u", + c, (int)rc->rc_devidx); + break; + } } - - tgts[i] = c++; } - - /* - * Setting tgts[n] simplifies the other edge condition. - */ - tgts[n] = rm->rm_cols; - - /* - * These buffers were allocated in previous iterations. - */ - for (i = 0; i < n - 1; i++) { - ASSERT(orig[i] != NULL); + if (dead > vdrz->vd_nparity) { + /* reconstruction not possible */ + zfs_dbgmsg("reconstruction not possible; too many failures"); + raidz_restore_orig_data(rm); + return (EINVAL); } + rr->rr_code = 0; + if (dead_data > 0) + rr->rr_code = vdev_raidz_reconstruct_row(rr, my_tgts, t); + } - orig[n - 1] = zio_buf_alloc(rm->rm_col[0].rc_size); + /* Check for success */ + if (raidz_checksum_verify(zio) == 0) { + + /* Reconstruction succeeded - report errors */ + for (int i = 0; i < rm->rm_nrows; i++) { + raidz_row_t *rr = rm->rm_row[i]; + atomic_inc_64(&raidz_corrected[rr->rr_code]); + + for (int c = 0; c < rr->rr_cols; c++) { + raidz_col_t *rc = &rr->rr_col[c]; + if (rc->rc_need_orig_restore) { + /* + * Note: if this is a parity column, + * we don't really know if it's wrong. + * We need to let + * vdev_raidz_io_done_verified() check + * it, and if we set rc_error, it will + * think that it is a "known" error + * that doesn't need to be checked + * or corrected. 
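+					 * For that reason, rc_error is only
+					 * set for data columns here.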
+ */ + if (rc->rc_error == 0 && c >= rr->rr_firstdatacol) { + raidz_checksum_error(zio, rc, rc->rc_gdata); + rc->rc_error = SET_ERROR(ECKSUM); + } + rc->rc_need_orig_restore = B_FALSE; + } + } - current = 0; - next = tgts[current]; + vdev_raidz_io_done_verified(zio, rr); + } - while (current != n) { - tgts[current] = next; - current = 0; + zio_checksum_verified(zio); - /* - * Save off the original data that we're going to - * attempt to reconstruct. - */ - for (i = 0; i < n; i++) { - ASSERT(orig[i] != NULL); - c = tgts[i]; - ASSERT3S(c, >=, 0); - ASSERT3S(c, <, rm->rm_cols); - rc = &rm->rm_col[c]; - abd_copy_to_buf(orig[i], rc->rc_abd, - rc->rc_size); - } + zfs_dbgmsg("reconstruction successful (checksum verified)"); + return (0); + } - /* - * Attempt a reconstruction and exit the outer loop on - * success. - */ - code = vdev_raidz_reconstruct(rm, tgts, n); - if (raidz_checksum_verify(zio) == 0) { - atomic_inc_64(&raidz_corrected[code]); - - for (i = 0; i < n; i++) { - c = tgts[i]; - rc = &rm->rm_col[c]; - ASSERT(rc->rc_error == 0); - if (rc->rc_tried) - raidz_checksum_error(zio, rc, - orig[i]); - rc->rc_error = SET_ERROR(ECKSUM); - } + /* Reconstruction failed - restore original data */ + raidz_restore_orig_data(rm); + zfs_dbgmsg("raidz_reconstruct_expanded(zio=%p) checksum failed", + zio); + return (ECKSUM); +} - ret = code; - goto done; - } +/* + * return 0 on success, ECKSUM on failure + */ +static int +vdev_raidz_combrec(zio_t *zio) +{ + raidz_map_t *rm = zio->io_vsd; + vdev_raidz_t *vdrz = zio->io_vd->vdev_tsd; + + for (int num_failures = 1; num_failures <= vdrz->vd_nparity; + num_failures++) { + int tstore[VDEV_RAIDZ_MAXPARITY + 2]; + int *ltgts = &tstore[1]; /* value is logical child ID */ + + /* Determine number of logical children, n */ + int n = 0; + for (int w = vdrz->vd_physical_width; + w >= vdrz->vd_logical_width; w--) { + n += w; + } - /* - * Restore the original data. - */ - for (i = 0; i < n; i++) { - c = tgts[i]; - rc = &rm->rm_col[c]; - abd_copy_from_buf(rc->rc_abd, orig[i], - rc->rc_size); - } + ASSERT3U(num_failures, <=, vdrz->vd_nparity); + ASSERT3U(num_failures, <=, VDEV_RAIDZ_MAXPARITY); + /* handle corner cases in combrec logic */ + ltgts[-1] = -1; + for (int i = 0; i < num_failures; i++) { + ltgts[i] = i; + } + ltgts[num_failures] = n; - do { + for (;;) { + int err = raidz_reconstruct(zio, + ltgts, num_failures); + if (err == EINVAL) { /* - * Find the next valid column after the current - * position.. + * Reconstruction not possible with this # + * failures; try more failures. */ - for (next = tgts[current] + 1; - next < rm->rm_cols && - rm->rm_col[next].rc_error != 0; next++) - continue; + break; + } else if (err == 0) + return (0); + + /* Compute next targets to try */ + for (int t = 0; ; t++) { + ASSERT3U(t, <, num_failures); + ltgts[t]++; + if (ltgts[t] == n) { + ASSERT3U(t, ==, num_failures - 1); + zfs_dbgmsg("reconstruction failed for num_failures=%u; tried all combinations", + num_failures); + break; // try more failures + } - ASSERT(next <= tgts[current + 1]); + ASSERT3U(ltgts[t], <, n); + ASSERT3U(ltgts[t], <=, ltgts[t + 1]); /* * If that spot is available, we're done here. */ - if (next != tgts[current + 1]) - break; + if (ltgts[t] != ltgts[t + 1]) + break; // found next combination /* - * Otherwise, find the next valid column after - * the previous position. + * Otherwise, reset this tgt to the minimum, + * and move on to the next tgt. 
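+				 * (The minimum is one more than the
+				 * previous slot's value, keeping the
+				 * targets strictly increasing.)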
*/ - for (c = tgts[current - 1] + 1; - rm->rm_col[c].rc_error != 0; c++) - continue; - - tgts[current] = c; - current++; - - } while (current != n); + ltgts[t] = ltgts[t - 1] + 1; + ASSERT3U(ltgts[t], ==, t); + } + if (ltgts[num_failures - 1] == n) + break; // try more failures } } - n--; -done: - for (i = 0; i < n; i++) { - zio_buf_free(orig[i], rm->rm_col[0].rc_size); - } - - return (ret); + zfs_dbgmsg("reconstruction failed for all num_failures"); + return (ECKSUM); } /* - * Complete an IO operation on a RAIDZ VDev + * Complete a write IO operation on a RAIDZ VDev * * Outline: - * - For write operations: * 1. Check for errors on the child IOs. * 2. Return, setting an error code if too few child VDevs were written * to reconstruct the data later. Note that partial writes are * considered successful if they can be reconstructed at all. - * - For read operations: - * 1. Check for errors on the child IOs. - * 2. If data errors occurred: - * a. Try to reassemble the data from the parity available. - * b. If we haven't yet read the parity drives, read them now. - * c. If all parity drives have been read but the data still doesn't - * reassemble with a correct checksum, then try combinatorial - * reconstruction. - * d. If that doesn't work, return an error. - * 3. If there were unexpected errors or this is a resilver operation, - * rewrite the vdevs that had errors. */ static void -vdev_raidz_io_done(zio_t *zio) +vdev_raidz_io_done_write_impl(zio_t *zio, raidz_row_t *rr) +{ + int total_errors = 0; + + ASSERT3U(rr->rr_missingparity, <=, rr->rr_firstdatacol); + ASSERT3U(rr->rr_missingdata, <=, rr->rr_cols - rr->rr_firstdatacol); + ASSERT3U(zio->io_type, ==, ZIO_TYPE_WRITE); + + for (int c = 0; c < rr->rr_cols; c++) { + raidz_col_t *rc = &rr->rr_col[c]; + + if (rc->rc_error) { + ASSERT(rc->rc_error != ECKSUM); /* child has no bp */ + + total_errors++; + } + } + + /* + * XXX -- for now, treat partial writes as a success. + * (If we couldn't write enough columns to reconstruct + * the data, the I/O failed. Otherwise, good enough.) + * + * Now that we support write reallocation, it would be better + * to treat partial failure as real failure unless there are + * no non-degraded top-level vdevs left, and not update DTLs + * if we intend to reallocate. + */ + /* XXPOLICY */ + if (total_errors > rr->rr_firstdatacol) { + zio->io_error = zio_worst_error(zio->io_error, + vdev_raidz_worst_error(rr)); + } +} + +/* + * return 0 if no reconstruction occurred, otherwise the "code" from + * vdev_raidz_reconstruct(). 
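+ * (The per-row reconstruction itself is done by
+ * vdev_raidz_reconstruct_row().)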
+ */ +static int +vdev_raidz_io_done_reconstruct_known_missing(zio_t *zio, raidz_row_t *rr) { - vdev_t *vd = zio->io_vd; - vdev_t *cvd; - raidz_map_t *rm = zio->io_vsd; - raidz_col_t *rc; - int unexpected_errors = 0; int parity_errors = 0; int parity_untried = 0; int data_errors = 0; int total_errors = 0; - int n, c; - int tgts[VDEV_RAIDZ_MAXPARITY]; - int code; - - ASSERT(zio->io_bp != NULL); /* XXX need to add code to enforce this */ + int code = 0; - ASSERT(rm->rm_missingparity <= rm->rm_firstdatacol); - ASSERT(rm->rm_missingdata <= rm->rm_cols - rm->rm_firstdatacol); + ASSERT3U(rr->rr_missingparity, <=, rr->rr_firstdatacol); + ASSERT3U(rr->rr_missingdata, <=, rr->rr_cols - rr->rr_firstdatacol); + ASSERT3U(zio->io_type, ==, ZIO_TYPE_READ); - for (c = 0; c < rm->rm_cols; c++) { - rc = &rm->rm_col[c]; + for (int c = 0; c < rr->rr_cols; c++) { + raidz_col_t *rc = &rr->rr_col[c]; if (rc->rc_error) { ASSERT(rc->rc_error != ECKSUM); /* child has no bp */ - if (c < rm->rm_firstdatacol) + if (c < rr->rr_firstdatacol) parity_errors++; else data_errors++; - if (!rc->rc_skipped) - unexpected_errors++; - total_errors++; - } else if (c < rm->rm_firstdatacol && !rc->rc_tried) { + } else if (c < rr->rr_firstdatacol && !rc->rc_tried) { parity_untried++; } } - if (zio->io_type == ZIO_TYPE_WRITE) { + /* + * If there were data errors and the number of errors we saw was + * correctable -- less than or equal to the number of parity disks read + * -- reconstruct based on the missing data. + */ + if (data_errors != 0 && + total_errors <= rr->rr_firstdatacol - parity_untried) { /* - * XXX -- for now, treat partial writes as a success. - * (If we couldn't write enough columns to reconstruct - * the data, the I/O failed. Otherwise, good enough.) - * - * Now that we support write reallocation, it would be better - * to treat partial failure as real failure unless there are - * no non-degraded top-level vdevs left, and not update DTLs - * if we intend to reallocate. + * We either attempt to read all the parity columns or + * none of them. If we didn't try to read parity, we + * wouldn't be here in the correctable case. There must + * also have been fewer parity errors than parity + * columns or, again, we wouldn't be in this code path. */ - /* XXPOLICY */ - if (total_errors > rm->rm_firstdatacol) - zio->io_error = vdev_raidz_worst_error(rm); + ASSERT(parity_untried == 0); + ASSERT(parity_errors < rr->rr_firstdatacol); - return; - } else if (zio->io_type == ZIO_TYPE_FREE) { - return; + /* + * Identify the data columns that reported an error. + */ + int n = 0; + int tgts[VDEV_RAIDZ_MAXPARITY]; + for (int c = rr->rr_firstdatacol; c < rr->rr_cols; c++) { + raidz_col_t *rc = &rr->rr_col[c]; + if (rc->rc_error != 0) { + ASSERT(n < VDEV_RAIDZ_MAXPARITY); + tgts[n++] = c; + } + } + + ASSERT(rr->rr_firstdatacol >= n); + + code = vdev_raidz_reconstruct_row(rr, tgts, n); } - ASSERT(zio->io_type == ZIO_TYPE_READ); - /* - * There are three potential phases for a read: - * 1. produce valid data from the columns read - * 2. read all disks and try again - * 3. perform combinatorial reconstruction - * - * Each phase is progressively both more expensive and less likely to - * occur. If we encounter more errors than we can repair or all phases - * fail, we have no choice but to return an error. - */ + return (code); +} - /* - * If the number of errors we saw was correctable -- less than or equal - * to the number of parity disks read -- attempt to produce data that - * has a valid checksum. 
Naturally, this case applies in the absence of - * any errors. - */ - if (total_errors <= rm->rm_firstdatacol - parity_untried) { - if (data_errors == 0) { - if (raidz_checksum_verify(zio) == 0) { - /* - * If we read parity information (unnecessarily - * as it happens since no reconstruction was - * needed) regenerate and verify the parity. - * We also regenerate parity when resilvering - * so we can write it out to the failed device - * later. - */ - if (parity_errors + parity_untried < - rm->rm_firstdatacol || - (zio->io_flags & ZIO_FLAG_RESILVER)) { - n = raidz_parity_verify(zio, rm); - unexpected_errors += n; - ASSERT(parity_errors + n <= - rm->rm_firstdatacol); - } - goto done; +/* + * return the number of reads issued. + */ +static int +vdev_raidz_read_all(zio_t *zio, raidz_row_t *rr) +{ + vdev_t *vd = zio->io_vd; + int nread = 0; + + rr->rr_missingdata = 0; + rr->rr_missingparity = 0; + + for (int c = 0; c < rr->rr_cols; c++) { + raidz_col_t *rc = &rr->rr_col[c]; + if (rc->rc_tried || rc->rc_size == 0) + continue; + + zio_nowait(zio_vdev_child_io(zio, NULL, + vd->vdev_child[rc->rc_devidx], + rc->rc_offset, rc->rc_abd, rc->rc_size, + zio->io_type, zio->io_priority, 0, + vdev_raidz_child_done, rc)); + nread++; + } + return (nread); +} + +static void +vdev_raidz_io_done(zio_t *zio) +{ + raidz_map_t *rm = zio->io_vsd; + vdev_raidz_t *vdrz = zio->io_vd->vdev_tsd; + + ASSERT(zio->io_bp != NULL); /* XXX need to add code to enforce this */ + if (zio->io_type == ZIO_TYPE_WRITE) { + for (int i = 0; i < rm->rm_nrows; i++) { + vdev_raidz_io_done_write_impl(zio, rm->rm_row[i]); + } + } else if (zio->io_type == ZIO_TYPE_FREE) { + return; + } else { + for (int i = 0; i < rm->rm_nrows; i++) { + raidz_row_t *rr = rm->rm_row[i]; + rr->rr_code = + vdev_raidz_io_done_reconstruct_known_missing(zio, + rr); + } + + if (raidz_checksum_verify(zio) == 0) { + for (int i = 0; i < rm->rm_nrows; i++) { + raidz_row_t *rr = rm->rm_row[i]; + atomic_inc_64(&raidz_corrected[rr->rr_code]); + vdev_raidz_io_done_verified(zio, rr); } + zio_checksum_verified(zio); } else { /* - * We either attempt to read all the parity columns or - * none of them. If we didn't try to read parity, we - * wouldn't be here in the correctable case. There must - * also have been fewer parity errors than parity - * columns or, again, we wouldn't be in this code path. + * This isn't a typical situation -- either we got a + * read error or a child silently returned bad data. + * Read every block so we can try again with as much + * data and parity as we can track down. If we've + * already been through once before, all children will + * be marked as tried so we'll proceed to combinatorial + * reconstruction. */ - ASSERT(parity_untried == 0); - ASSERT(parity_errors < rm->rm_firstdatacol); - + int nread = 0; + for (int i = 0; i < rm->rm_nrows; i++) { + nread += vdev_raidz_read_all(zio, + rm->rm_row[i]); + } + if (nread != 0) { + /* + * Normally our stage is VDEV_IO_DONE, but if + * we've already called redone(), it will have + * changed to VDEV_IO_START, in which case we + * don't want to call redone() again. + */ + if (zio->io_stage != ZIO_STAGE_VDEV_IO_START) + zio_vdev_io_redone(zio); + return; + } /* - * Identify the data columns that reported an error. + * It would be too expensive to try every possible + * combination of failed sectors in every row, so + * instead we try every combination of failed current or + * past physical disk. 
This means that if the incorrect
+		 * sectors were all on Nparity disks at any point in the
+		 * past, we will find the correct data.  I think that
+		 * the only case where this is less durable than
+		 * a non-expanded RAIDZ, is if we have a silent
+		 * failure during expansion.  In that case, one block
+		 * could be partially in the old format and partially
+		 * in the new format, so we'd lose some sectors
+		 * from the old format and some from the new format.
+		 *
+		 * e.g. logical_width=4 physical_width=6
+		 * the 15 (6+5+4) possible failed disks are:
+		 * width=6 child=0
+		 * width=6 child=1
+		 * width=6 child=2
+		 * width=6 child=3
+		 * width=6 child=4
+		 * width=6 child=5
+		 * width=5 child=0
+		 * width=5 child=1
+		 * width=5 child=2
+		 * width=5 child=3
+		 * width=5 child=4
+		 * width=4 child=0
+		 * width=4 child=1
+		 * width=4 child=2
+		 * width=4 child=3
+		 * And we will try every combination of Nparity of these
+		 * failing.
+		 *
+		 * As a first pass, we can generate every combo,
+		 * and try reconstructing, ignoring any known
+		 * failures.  If any row has too many known + simulated
+		 * failures, then we bail on reconstructing with this
+		 * number of simulated failures.  As an improvement,
+		 * we could detect the number of whole known failures
+		 * (i.e. we have known failures on these disks for
+		 * every row; the disks never succeeded), and
+		 * subtract that from the max # failures to simulate.
+		 * We could go even further like the current
+		 * combrec code, but that doesn't seem like it
+		 * gains us very much.  If we simulate a failure
+		 * that is also a known failure, that's fine.
		 */
-		n = 0;
-		for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) {
-			rc = &rm->rm_col[c];
-			if (rc->rc_error != 0) {
-				ASSERT(n < VDEV_RAIDZ_MAXPARITY);
-				tgts[n++] = c;
-			}
-		}
-
-		ASSERT(rm->rm_firstdatacol >= n);
-
-		code = vdev_raidz_reconstruct(rm, tgts, n);
-
-		if (raidz_checksum_verify(zio) == 0) {
-			atomic_inc_64(&raidz_corrected[code]);
-
+		if (vdev_raidz_combrec(zio) != 0) {
			/*
-			 * If we read more parity disks than were used
-			 * for reconstruction, confirm that the other
-			 * parity disks produced correct data.  This
-			 * routine is suboptimal in that it regenerates
-			 * the parity that we already used in addition
-			 * to the parity that we're attempting to
-			 * verify, but this should be a relatively
-			 * uncommon case, and can be optimized if it
-			 * becomes a problem.  Note that we regenerate
-			 * parity when resilvering so we can write it
-			 * out to failed devices later.
+			 * We're here because either:
+			 *
+			 * total_errors == rr_firstdatacol, or
+			 * vdev_raidz_combrec() failed
+			 *
+			 * In either case, there is enough bad data to prevent
+			 * reconstruction.
+			 *
+			 * Start checksum ereports for all children which haven't
+			 * failed, provided the IO wasn't speculative.
*/ - if (parity_errors < rm->rm_firstdatacol - n || - (zio->io_flags & ZIO_FLAG_RESILVER)) { - n = raidz_parity_verify(zio, rm); - unexpected_errors += n; - ASSERT(parity_errors + n <= - rm->rm_firstdatacol); + zio->io_error = SET_ERROR(ECKSUM); + + if (!(zio->io_flags & ZIO_FLAG_SPECULATIVE)) { + for (int i = 0; i < rm->rm_nrows; i++) { + raidz_row_t *rr = rm->rm_row[i]; + for (int c = 0; c < rr->rr_cols; c++) { + raidz_col_t *rc = &rr->rr_col[c]; + if (rc->rc_error == 0) { + zio_bad_cksum_t zbc; + zbc.zbc_has_cksum = 0; + zbc.zbc_injected = + rm->rm_ecksuminjected; + + zfs_ereport_start_checksum( + zio->io_spa, + zio->io_vd->vdev_child[rc->rc_devidx], + zio, rc->rc_offset, rc->rc_size, + (void *)(uintptr_t)c, &zbc); + } + } + } } - - goto done; } } } + ASSERT(!vdrz->vn_expanding); +} + +static void +vdev_raidz_state_change(vdev_t *vd, int faulted, int degraded) +{ + vdev_raidz_t *vdrz = vd->vdev_tsd; + if (faulted > vdrz->vd_nparity) + vdev_set_state(vd, B_FALSE, VDEV_STATE_CANT_OPEN, + VDEV_AUX_NO_REPLICAS); + else if (degraded + faulted != 0) + vdev_set_state(vd, B_FALSE, VDEV_STATE_DEGRADED, VDEV_AUX_NONE); + else + vdev_set_state(vd, B_FALSE, VDEV_STATE_HEALTHY, VDEV_AUX_NONE); +} + +static void +raidz_copy_range(void *arg, uint64_t start, uint64_t size) +{ + vdev_t *vd = arg; + int ashift = vd->vdev_top->vdev_ashift; + int old_children = vd->vdev_children - 1; + spa_t *spa = vd->vdev_spa; + + ASSERT(IS_P2ALIGNED(start, 1 << ashift)); + ASSERT(IS_P2ALIGNED(size, 1 << ashift)); + + abd_t *abd = abd_alloc_for_io(1 << ashift, B_FALSE); + for (uint64_t i = MAX(start >> ashift, old_children); + i < (start + size) >> ashift; i++) { + int child = i % old_children; + int offset = (i / old_children) << ashift; + spa_config_enter(spa, SCL_STATE, spa, RW_READER); + VERIFY0(zio_wait(zio_read_phys(NULL, + vd->vdev_child[child], + offset + VDEV_LABEL_START_SIZE, + 1 << ashift, abd, + ZIO_CHECKSUM_OFF, NULL, NULL, + ZIO_PRIORITY_REMOVAL, 0, B_FALSE))); + + child = i % vd->vdev_children; + offset = (i / vd->vdev_children) << ashift; + VERIFY0(zio_wait(zio_write_phys(NULL, + vd->vdev_child[child], + offset + VDEV_LABEL_START_SIZE, + 1 << ashift, abd, + ZIO_CHECKSUM_OFF, NULL, NULL, + ZIO_PRIORITY_REMOVAL, 0, B_FALSE))); + spa_config_exit(spa, SCL_STATE, spa); + } + abd_free(abd); +} + +void +vdev_raidz_attach_sync(void *arg, dmu_tx_t *tx) +{ + vdev_t *new_child = arg; + spa_t *spa = new_child->vdev_spa; + vdev_t *raidvd = new_child->vdev_parent; + vdev_raidz_t *vdrz = raidvd->vdev_tsd; + ASSERT3P(raidvd->vdev_ops, ==, &vdev_raidz_ops); + ASSERT3P(raidvd->vdev_top, ==, raidvd); + ASSERT3U(raidvd->vdev_children, >, vdrz->vd_logical_width); + ASSERT3U(raidvd->vdev_children, ==, vdrz->vd_physical_width + 1); /* - * This isn't a typical situation -- either we got a read error or - * a child silently returned bad data. Read every block so we can - * try again with as much data and parity as we can track down. If - * we've already been through once before, all children will be marked - * as tried so we'll proceed to combinatorial reconstruction. + * XXX assuming that no other i/o takes place while this is happening, + * until we increment physical_width. But ZIL could do i/o. 
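+	 * The vn_expanding flag records this assumption; the I/O
+	 * paths ASSERT that it is clear.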
*/ - unexpected_errors = 1; - rm->rm_missingdata = 0; - rm->rm_missingparity = 0; + vdrz->vn_expanding = B_TRUE; - for (c = 0; c < rm->rm_cols; c++) { - if (rm->rm_col[c].rc_tried) - continue; + /*spa_config_enter(spa, SCL_CONFIG, FTAG, RW_READER);*/ - zio_vdev_io_redone(zio); - do { - rc = &rm->rm_col[c]; - if (rc->rc_tried) - continue; - zio_nowait(zio_vdev_child_io(zio, NULL, - vd->vdev_child[rc->rc_devidx], - rc->rc_offset, rc->rc_abd, rc->rc_size, - zio->io_type, zio->io_priority, 0, - vdev_raidz_child_done, rc)); - } while (++c < rm->rm_cols); + range_tree_t *rt = range_tree_create(NULL, NULL); - return; + for (uint64_t i = 0; i < raidvd->vdev_ms_count; i++) { + metaslab_t *msp = raidvd->vdev_ms[i]; + + /*vdev_initialize_ms_mark(msp);*/ + mutex_enter(&msp->ms_lock); + + metaslab_load_wait(msp); + if (!msp->ms_loaded) + VERIFY0(metaslab_load(msp)); + + /* + * We want to copy everything except the free (allocatable) + * space. Note that there may be a little bit more free + * space (e.g. in ms_defer), and it's fine to copy that too. + */ + ASSERT(range_tree_is_empty(rt)); + range_tree_add(rt, msp->ms_start, msp->ms_size); + range_tree_walk(msp->ms_allocatable, range_tree_remove, rt); + mutex_exit(&msp->ms_lock); + + /*spa_config_exit(spa, SCL_CONFIG, FTAG);*/ + /* Note, _vacate() doesn't visit in order */ + range_tree_walk(rt, raidz_copy_range, raidvd); + range_tree_vacate(rt, NULL, NULL); + /*vdev_initialize_ms_unmark(msp);*/ + /*spa_config_enter(spa, SCL_CONFIG, FTAG, RW_READER);*/ } + /*spa_config_exit(spa, SCL_CONFIG, FTAG);*/ + range_tree_destroy(rt); + + vdrz->vd_physical_width++; + +#if 0 + raidvd->vdev_expanding = B_TRUE; + vdev_reopen(raidvd); + raidvd->vdev_expanding = B_FALSE; +#endif + + vdrz->vn_expanding = B_FALSE; + /* Ensure that widths get written to label config */ + vdev_config_dirty(raidvd); +} + +/* + * Add RAIDZ-specific fields to the config nvlist. + * XXX add this to vdev_ops_t? + */ +void +vdev_raidz_config_generate(vdev_t *vd, nvlist_t *nv) +{ + spa_t *spa = vd->vdev_spa; + ASSERT3P(vd->vdev_ops, ==, &vdev_raidz_ops); + vdev_raidz_t *vdrz = vd->vdev_tsd; + + /* + * Make sure someone hasn't managed to sneak a fancy new vdev + * into a crufty old storage pool. + */ + ASSERT(vdrz->vd_nparity == 1 || + (vdrz->vd_nparity <= 2 && + spa_version(spa) >= SPA_VERSION_RAIDZ2) || + (vdrz->vd_nparity <= 3 && + spa_version(spa) >= SPA_VERSION_RAIDZ3)); + /* - * At this point we've attempted to reconstruct the data given the - * errors we detected, and we've attempted to read all columns. There - * must, therefore, be one or more additional problems -- silent errors - * resulting in invalid data rather than explicit I/O errors resulting - * in absent data. We check if there is enough additional data to - * possibly reconstruct the data and then perform combinatorial - * reconstruction over all possible combinations. If that fails, - * we're cooked. + * Note that we'll add these even on storage pools where they + * aren't strictly required -- older software will just ignore + * it. */ - if (total_errors > rm->rm_firstdatacol) { - zio->io_error = vdev_raidz_worst_error(rm); + fnvlist_add_uint64(nv, ZPOOL_CONFIG_NPARITY, vdrz->vd_nparity); + fnvlist_add_uint64(nv, ZPOOL_CONFIG_RAIDZ_LOGICAL_WIDTH, + vdrz->vd_logical_width); +} + +/* + * Set RAIDZ-specific fields in the vdev_t, based on the config. + * Can't assume that anything about the vdev_t is already set. + * XXX add this to vdev_ops_t? 
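+ * (The returned struct is what the I/O paths later read
+ * back through vd->vdev_tsd.)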
+ */
+void *
+vdev_raidz_get_tsd(spa_t *spa, nvlist_t *nv)
+{
+	uint64_t nparity, lw;
+	vdev_raidz_t *vdrz = kmem_zalloc(sizeof (*vdrz), KM_SLEEP);
+
+	uint_t children;
+	nvlist_t **child;
+	int error = nvlist_lookup_nvlist_array(nv,
+	    ZPOOL_CONFIG_CHILDREN, &child, &children);
+	if (error != 0)
+		goto out;
+
+	vdrz->vd_logical_width = children;
+	vdrz->vd_physical_width = children;
+
+	if (nvlist_lookup_uint64(nv, ZPOOL_CONFIG_RAIDZ_LOGICAL_WIDTH,
+	    &lw) == 0) {
+		vdrz->vd_logical_width = lw;
+	}

-	} else if (total_errors < rm->rm_firstdatacol &&
-	    (code = vdev_raidz_combrec(zio, total_errors, data_errors)) != 0) {
+	if (nvlist_lookup_uint64(nv, ZPOOL_CONFIG_NPARITY,
+	    &nparity) == 0) {
+		if (nparity == 0 || nparity > VDEV_RAIDZ_MAXPARITY)
+			goto out;
		/*
-		 * If we didn't use all the available parity for the
-		 * combinatorial reconstruction, verify that the remaining
-		 * parity is correct.
+		 * Previous versions could only support 1 or 2 parity
+		 * devices.
		 */
-		if (code != (1 << rm->rm_firstdatacol) - 1)
-			(void) raidz_parity_verify(zio, rm);
+		if (nparity > 1 &&
+		    spa_version(spa) < SPA_VERSION_RAIDZ2)
+			goto out;
+		if (nparity > 2 &&
+		    spa_version(spa) < SPA_VERSION_RAIDZ3)
+			goto out;
	} else {
		/*
-		 * We're here because either:
-		 *
-		 * total_errors == rm_first_datacol, or
-		 * vdev_raidz_combrec() failed
-		 *
-		 * In either case, there is enough bad data to prevent
-		 * reconstruction.
-		 *
-		 * Start checksum ereports for all children which haven't
-		 * failed, and the IO wasn't speculative.
+		 * We require the parity to be specified for SPAs that
+		 * support multiple parity levels.
		 */
-		zio->io_error = SET_ERROR(ECKSUM);
-
-		if (!(zio->io_flags & ZIO_FLAG_SPECULATIVE)) {
-			for (c = 0; c < rm->rm_cols; c++) {
-				rc = &rm->rm_col[c];
-				if (rc->rc_error == 0) {
-					zio_bad_cksum_t zbc;
-					zbc.zbc_has_cksum = 0;
-					zbc.zbc_injected =
-					    rm->rm_ecksuminjected;
-
-					zfs_ereport_start_checksum(
-					    zio->io_spa,
-					    vd->vdev_child[rc->rc_devidx],
-					    zio, rc->rc_offset, rc->rc_size,
-					    (void *)(uintptr_t)c, &zbc);
-				}
-			}
-		}
-	}
-	}

-done:
-	zio_checksum_verified(zio);
-
-	if (zio->io_error == 0 && spa_writeable(zio->io_spa) &&
-	    (unexpected_errors || (zio->io_flags & ZIO_FLAG_RESILVER))) {
+		if (spa_version(spa) >= SPA_VERSION_RAIDZ2)
+			goto out;
		/*
-		 * Use the good data we have in hand to repair damaged children.
+		 * Otherwise, we default to 1 parity device for RAID-Z.
		 */
-		for (c = 0; c < rm->rm_cols; c++) {
-			rc = &rm->rm_col[c];
-			cvd = vd->vdev_child[rc->rc_devidx];
-
-			if (rc->rc_error == 0)
-				continue;
-
-			zio_nowait(zio_vdev_child_io(zio, NULL, cvd,
-			    rc->rc_offset, rc->rc_abd, rc->rc_size,
-			    ZIO_TYPE_WRITE, ZIO_PRIORITY_ASYNC_WRITE,
-			    ZIO_FLAG_IO_REPAIR | (unexpected_errors ?
- ZIO_FLAG_SELF_HEAL : 0), NULL, NULL)); - } + nparity = 1; } -} - -static void -vdev_raidz_state_change(vdev_t *vd, int faulted, int degraded) -{ - if (faulted > vd->vdev_nparity) - vdev_set_state(vd, B_FALSE, VDEV_STATE_CANT_OPEN, - VDEV_AUX_NO_REPLICAS); - else if (degraded + faulted != 0) - vdev_set_state(vd, B_FALSE, VDEV_STATE_DEGRADED, VDEV_AUX_NONE); - else - vdev_set_state(vd, B_FALSE, VDEV_STATE_HEALTHY, VDEV_AUX_NONE); + vdrz->vd_nparity = nparity; + return (vdrz); +out: + kmem_free(vdrz, sizeof (*vdrz)); + return (NULL); } vdev_ops_t vdev_raidz_ops = { Index: sys/cddl/contrib/opensolaris/uts/common/sys/fs/zfs.h =================================================================== --- sys/cddl/contrib/opensolaris/uts/common/sys/fs/zfs.h +++ sys/cddl/contrib/opensolaris/uts/common/sys/fs/zfs.h @@ -556,6 +556,7 @@ #define ZPOOL_CONFIG_SPARES "spares" #define ZPOOL_CONFIG_IS_SPARE "is_spare" #define ZPOOL_CONFIG_NPARITY "nparity" +#define ZPOOL_CONFIG_RAIDZ_LOGICAL_WIDTH "raidz_logical_width" #define ZPOOL_CONFIG_HOSTID "hostid" #define ZPOOL_CONFIG_HOSTNAME "hostname" #define ZPOOL_CONFIG_LOADED_TIME "initial_load_time"
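
Reviewer note (not part of the patch): the slot-increment rules in the block
comment above vdev_raidz_combrec() can be sanity-checked with a small
standalone C sketch. Everything below is invented for illustration --
NPARITY, NCOLS, and next_combo are not names from the patch. It assumes a
6-wide RAIDZ3 with no known errors, and prints exactly the twenty states of
the STATE/ACTION table, from the all-parity combination "0 1 2" through
"3 4 5".

#include <stdio.h>

#define	NPARITY	3	/* assumed: triple parity */
#define	NCOLS	6	/* assumed: 6 children */

/*
 * Advance tgts[] to the next combination using the same walk as
 * vdev_raidz_combrec(): increment a slot; if it collides with the
 * next slot, reset it to its minimum and carry into the next slot.
 * Returns 0 once the last slot runs off the end (all combinations
 * of NPARITY targets have been visited).
 */
static int
next_combo(int *tgts, int nslots, int n)
{
	for (int i = 0; i < nslots; i++) {
		tgts[i]++;
		if (tgts[i] == n)
			return (0);	/* only the last slot can reach n */
		if (tgts[i] != tgts[i + 1])
			return (1);	/* found the next combination */
		/* reset to this slot's minimum and carry upward */
		tgts[i] = (i == 0) ? 0 : tgts[i - 1] + 1;
	}
	return (0);
}

int
main(void)
{
	int tgts[NPARITY + 1];

	for (int i = 0; i < NPARITY; i++)
		tgts[i] = i;		/* start at 0 1 2 (all parity) */
	tgts[NPARITY] = NCOLS;		/* sentinel, like ltgts[num_failures] = n */

	do {
		printf("%d %d %d\n", tgts[0], tgts[1], tgts[2]);
	} while (next_combo(tgts, NPARITY, NCOLS));
	return (0);
}

Unlike the patch, the sketch does not need the tstore[-1] = -1 sentinel,
because the slot-0 reset is special-cased inside next_combo(); the patch's
ltgts[t] = ltgts[t - 1] + 1 relies on that sentinel to make slot 0 reset
to zero.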