Page MenuHomeFreeBSD

D15124.diff
No OneTemporary

D15124.diff

Index: sys/cddl/contrib/opensolaris/uts/common/fs/zfs/abd.c
===================================================================
--- sys/cddl/contrib/opensolaris/uts/common/fs/zfs/abd.c
+++ sys/cddl/contrib/opensolaris/uts/common/fs/zfs/abd.c
@@ -758,6 +758,9 @@
int ret = 0;
struct abd_iter aiter;
+ if (size == 0)
+ return (ret);
+
abd_verify(abd);
ASSERT3U(off + size, <=, abd->abd_size);
@@ -886,6 +889,9 @@
int ret = 0;
struct abd_iter daiter, saiter;
+ if (size == 0)
+ return (ret);
+
abd_verify(dabd);
abd_verify(sabd);
Index: sys/cddl/contrib/opensolaris/uts/common/fs/zfs/refcount.c
===================================================================
--- sys/cddl/contrib/opensolaris/uts/common/fs/zfs/refcount.c
+++ sys/cddl/contrib/opensolaris/uts/common/fs/zfs/refcount.c
@@ -91,7 +91,7 @@
{
reference_t *ref;
- ASSERT(rc->rc_count == number);
+ ASSERT3U(rc->rc_count, ==, number);
while (ref = list_head(&rc->rc_list)) {
list_remove(&rc->rc_list, ref);
kmem_cache_free(reference_cache, ref);
Index: sys/cddl/contrib/opensolaris/uts/common/fs/zfs/spa.c
===================================================================
--- sys/cddl/contrib/opensolaris/uts/common/fs/zfs/spa.c
+++ sys/cddl/contrib/opensolaris/uts/common/fs/zfs/spa.c
@@ -55,6 +55,7 @@
#include <sys/vdev_removal.h>
#include <sys/vdev_indirect_mapping.h>
#include <sys/vdev_indirect_births.h>
+#include <sys/vdev_raidz.h>
#include <sys/metaslab.h>
#include <sys/metaslab_impl.h>
#include <sys/uberblock_impl.h>
@@ -5923,8 +5924,9 @@
vdev_t *oldvd, *newvd, *newrootvd, *pvd, *tvd;
vdev_ops_t *pvops;
char *oldvdpath, *newvdpath;
- int newvd_isspare;
+ int newvd_isspare = B_FALSE;
int error;
+ boolean_t raidz = B_FALSE;
ASSERT(spa_writeable(spa));
@@ -5947,10 +5949,16 @@
if (oldvd == NULL)
return (spa_vdev_exit(spa, NULL, txg, ENODEV));
- if (!oldvd->vdev_ops->vdev_op_leaf)
+ if (oldvd->vdev_ops == &vdev_raidz_ops) {
+ raidz = B_TRUE;
+ } else if (!oldvd->vdev_ops->vdev_op_leaf) {
return (spa_vdev_exit(spa, NULL, txg, ENOTSUP));
+ }
- pvd = oldvd->vdev_parent;
+ if (raidz)
+ pvd = oldvd;
+ else
+ pvd = oldvd->vdev_parent;
if ((error = spa_config_parse(spa, &newrootvd, nvroot, NULL, 0,
VDEV_ALLOC_ATTACH)) != 0)
@@ -5979,6 +5987,7 @@
* vdev.
*/
if (pvd->vdev_ops != &vdev_mirror_ops &&
+ pvd->vdev_ops != &vdev_raidz_ops &&
pvd->vdev_ops != &vdev_root_ops)
return (spa_vdev_exit(spa, newrootvd, txg, ENOTSUP));
@@ -6018,7 +6027,8 @@
/*
* Make sure the new device is big enough.
*/
- if (newvd->vdev_asize < vdev_get_min_asize(oldvd))
+ vdev_t *min_vdev = raidz ? oldvd->vdev_child[0] : oldvd;
+ if (newvd->vdev_asize < vdev_get_min_asize(min_vdev))
return (spa_vdev_exit(spa, newrootvd, txg, EOVERFLOW));
/*
@@ -6028,35 +6038,48 @@
if (newvd->vdev_ashift > oldvd->vdev_top->vdev_ashift)
return (spa_vdev_exit(spa, newrootvd, txg, EDOM));
+ if (raidz) {
+ oldvdpath = kmem_asprintf("raidz%u-%u",
+ oldvd->vdev_nparity, oldvd->vdev_id);
+ } else {
+ oldvdpath = spa_strdup(oldvd->vdev_path);
+ }
+ newvdpath = spa_strdup(newvd->vdev_path);
+
/*
* If this is an in-place replacement, update oldvd's path and devid
* to make it distinguishable from newvd, and unopenable from now on.
*/
- if (strcmp(oldvd->vdev_path, newvd->vdev_path) == 0) {
+ if (strcmp(oldvdpath, newvdpath) == 0) {
spa_strfree(oldvd->vdev_path);
- oldvd->vdev_path = kmem_alloc(strlen(newvd->vdev_path) + 5,
+ oldvd->vdev_path = kmem_alloc(strlen(newvdpath) + 5,
KM_SLEEP);
- (void) sprintf(oldvd->vdev_path, "%s/%s",
- newvd->vdev_path, "old");
+ (void) sprintf(oldvd->vdev_path, "%s/old",
+ newvdpath);
if (oldvd->vdev_devid != NULL) {
spa_strfree(oldvd->vdev_devid);
oldvd->vdev_devid = NULL;
}
+ spa_strfree(oldvdpath);
+ oldvdpath = spa_strdup(oldvd->vdev_path);
}
/* mark the device being resilvered */
- newvd->vdev_resilver_txg = txg;
+ if (!raidz)
+ newvd->vdev_resilver_txg = txg;
/*
* If the parent is not a mirror, or if we're replacing, insert the new
* mirror/replacing/spare vdev above oldvd.
*/
- if (pvd->vdev_ops != pvops)
+ if (!raidz && pvd->vdev_ops != pvops)
pvd = vdev_add_parent(oldvd, pvops);
ASSERT(pvd->vdev_top->vdev_parent == rvd);
+#if 0
ASSERT(pvd->vdev_ops == pvops);
ASSERT(oldvd->vdev_parent == pvd);
+#endif
/*
* Extract the new device from its root and add it to pvd.
@@ -6079,29 +6102,34 @@
*/
dtl_max_txg = txg + TXG_CONCURRENT_STATES;
- vdev_dtl_dirty(newvd, DTL_MISSING, TXG_INITIAL,
- dtl_max_txg - TXG_INITIAL);
+ if (raidz) {
+ dmu_tx_t *tx = dmu_tx_create_assigned(spa->spa_dsl_pool, txg);
+ dsl_sync_task_nowait(spa->spa_dsl_pool, vdev_raidz_attach_sync,
+ newvd, 0, ZFS_SPACE_CHECK_EXTRA_RESERVED, tx);
+ dmu_tx_commit(tx);
+ } else {
+ vdev_dtl_dirty(newvd, DTL_MISSING, TXG_INITIAL,
+ dtl_max_txg - TXG_INITIAL);
- if (newvd->vdev_isspare) {
- spa_spare_activate(newvd);
- spa_event_notify(spa, newvd, NULL, ESC_ZFS_VDEV_SPARE);
- }
+ if (newvd->vdev_isspare) {
+ spa_spare_activate(newvd);
+ spa_event_notify(spa, newvd, NULL, ESC_ZFS_VDEV_SPARE);
+ }
- oldvdpath = spa_strdup(oldvd->vdev_path);
- newvdpath = spa_strdup(newvd->vdev_path);
- newvd_isspare = newvd->vdev_isspare;
+ newvd_isspare = newvd->vdev_isspare;
- /*
- * Mark newvd's DTL dirty in this txg.
- */
- vdev_dirty(tvd, VDD_DTL, newvd, txg);
+ /*
+ * Mark newvd's DTL dirty in this txg.
+ */
+ vdev_dirty(tvd, VDD_DTL, newvd, txg);
- /*
- * Schedule the resilver to restart in the future. We do this to
- * ensure that dmu_sync-ed blocks have been stitched into the
- * respective datasets.
- */
- dsl_resilver_restart(spa->spa_dsl_pool, dtl_max_txg);
+ /*
+ * Schedule the resilver to restart in the future. We do this to
+ * ensure that dmu_sync-ed blocks have been stitched into the
+ * respective datasets.
+ */
+ dsl_resilver_restart(spa->spa_dsl_pool, dtl_max_txg);
+ }
if (spa->spa_bootfs)
spa_event_notify(spa, newvd, NULL, ESC_ZFS_BOOTFS_VDEV_ATTACH);
@@ -6113,6 +6141,10 @@
*/
(void) spa_vdev_exit(spa, newrootvd, dtl_max_txg, 0);
+ if (raidz) {
+ error = vdev_online(spa, guid, ZFS_ONLINE_EXPAND, NULL);
+ }
+
spa_history_log_internal(spa, "vdev attach", NULL,
"%s vdev=%s %s vdev=%s",
replacing && newvd_isspare ? "spare in" :
Index: sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/vdev_raidz.h
===================================================================
--- sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/vdev_raidz.h
+++ sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/vdev_raidz.h
@@ -39,10 +39,18 @@
extern "C" {
#endif
-#ifdef _KERNEL
+typedef struct vdev_raidz {
+ int vd_logical_width;
+ int vd_physical_width;
+ int vd_nparity;
+ boolean_t vn_expanding;
+} vdev_raidz_t;
+
extern int vdev_raidz_physio(vdev_t *,
caddr_t, size_t, uint64_t, uint64_t, boolean_t, boolean_t);
-#endif
+extern void vdev_raidz_attach_sync(void *, dmu_tx_t *);
+extern void vdev_raidz_config_generate(vdev_t *, nvlist_t *);
+extern void *vdev_raidz_get_tsd(spa_t *, nvlist_t *);
#ifdef __cplusplus
}
#endif
Index: sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev.c
===================================================================
--- sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev.c
+++ sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev.c
@@ -49,6 +49,7 @@
#include <sys/arc.h>
#include <sys/zil.h>
#include <sys/dsl_scan.h>
+#include <sys/vdev_raidz.h>
#include <sys/abd.h>
#include <sys/trim_map.h>
@@ -584,7 +585,7 @@
{
vdev_ops_t *ops;
char *type;
- uint64_t guid = 0, islog, nparity;
+ uint64_t guid = 0, islog;
vdev_t *vd;
vdev_indirect_config_t *vic;
@@ -637,47 +638,21 @@
if (ops == &vdev_hole_ops && spa_version(spa) < SPA_VERSION_HOLES)
return (SET_ERROR(ENOTSUP));
- /*
- * Set the nparity property for RAID-Z vdevs.
- */
- nparity = -1ULL;
+ void *tsd = NULL;
+ int nparity = 0;
if (ops == &vdev_raidz_ops) {
- if (nvlist_lookup_uint64(nv, ZPOOL_CONFIG_NPARITY,
- &nparity) == 0) {
- if (nparity == 0 || nparity > VDEV_RAIDZ_MAXPARITY)
- return (SET_ERROR(EINVAL));
- /*
- * Previous versions could only support 1 or 2 parity
- * device.
- */
- if (nparity > 1 &&
- spa_version(spa) < SPA_VERSION_RAIDZ2)
- return (SET_ERROR(ENOTSUP));
- if (nparity > 2 &&
- spa_version(spa) < SPA_VERSION_RAIDZ3)
- return (SET_ERROR(ENOTSUP));
- } else {
- /*
- * We require the parity to be specified for SPAs that
- * support multiple parity levels.
- */
- if (spa_version(spa) >= SPA_VERSION_RAIDZ2)
- return (SET_ERROR(EINVAL));
- /*
- * Otherwise, we default to 1 parity device for RAID-Z.
- */
- nparity = 1;
- }
- } else {
- nparity = 0;
+ vdev_raidz_t *rz = tsd = vdev_raidz_get_tsd(spa, nv);
+ if (rz == NULL)
+ return (SET_ERROR(EINVAL));
+ nparity = rz->vd_nparity;
}
- ASSERT(nparity != -1ULL);
vd = vdev_alloc_common(spa, id, guid, ops);
vic = &vd->vdev_indirect_config;
vd->vdev_islog = islog;
vd->vdev_nparity = nparity;
+ vd->vdev_tsd = tsd;
if (nvlist_lookup_string(nv, ZPOOL_CONFIG_PATH, &vd->vdev_path) == 0)
vd->vdev_path = spa_strdup(vd->vdev_path);
@@ -849,6 +824,11 @@
ASSERT(vd->vdev_child == NULL);
ASSERT(vd->vdev_guid_sum == vd->vdev_guid);
+ if (vd->vdev_ops == &vdev_raidz_ops) {
+ vdev_raidz_t *rz = vd->vdev_tsd;
+ kmem_free(rz, sizeof(*rz));
+ }
+
/*
* Discard allocation state.
*/
@@ -3155,8 +3135,10 @@
if ((vd = spa_lookup_by_guid(spa, guid, B_TRUE)) == NULL)
return (spa_vdev_state_exit(spa, NULL, ENODEV));
+#if 0
if (!vd->vdev_ops->vdev_op_leaf)
return (spa_vdev_state_exit(spa, NULL, ENOTSUP));
+#endif
wasoffline = (vd->vdev_offline || vd->vdev_tmpoffline);
oldstate = vd->vdev_state;
Index: sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev_indirect.c
===================================================================
--- sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev_indirect.c
+++ sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev_indirect.c
@@ -1078,6 +1078,13 @@
if (vd->vdev_ops == &vdev_indirect_ops)
return;
+ printf("vdev_indirect_io_start_cb: src=%llx split_offset=%x dst: vd=%u off=%llx size=%x\n",
+ (long long)zio->io_offset,
+ (int)split_offset,
+ (int)vd->vdev_id,
+ (long long)offset,
+ (int)size);
+
zio_nowait(zio_vdev_child_io(zio, NULL, vd, offset,
abd_get_offset(zio->io_abd, split_offset),
size, zio->io_type, zio->io_priority,
Index: sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev_label.c
===================================================================
--- sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev_label.c
+++ sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev_label.c
@@ -141,6 +141,7 @@
#include <sys/zap.h>
#include <sys/vdev.h>
#include <sys/vdev_impl.h>
+#include <sys/vdev_raidz.h>
#include <sys/uberblock_impl.h>
#include <sys/metaslab.h>
#include <sys/metaslab_impl.h>
@@ -276,31 +277,13 @@
if (vd->vdev_fru != NULL)
fnvlist_add_string(nv, ZPOOL_CONFIG_FRU, vd->vdev_fru);
- if (vd->vdev_nparity != 0) {
- ASSERT(strcmp(vd->vdev_ops->vdev_op_type,
- VDEV_TYPE_RAIDZ) == 0);
+ if (vd->vdev_ops == &vdev_raidz_ops)
+ vdev_raidz_config_generate(vd, nv);
- /*
- * Make sure someone hasn't managed to sneak a fancy new vdev
- * into a crufty old storage pool.
- */
- ASSERT(vd->vdev_nparity == 1 ||
- (vd->vdev_nparity <= 2 &&
- spa_version(spa) >= SPA_VERSION_RAIDZ2) ||
- (vd->vdev_nparity <= 3 &&
- spa_version(spa) >= SPA_VERSION_RAIDZ3));
-
- /*
- * Note that we'll add the nparity tag even on storage pools
- * that only support a single parity device -- older software
- * will just ignore it.
- */
- fnvlist_add_uint64(nv, ZPOOL_CONFIG_NPARITY, vd->vdev_nparity);
- }
-
- if (vd->vdev_wholedisk != -1ULL)
+ if (vd->vdev_wholedisk != -1ULL) {
fnvlist_add_uint64(nv, ZPOOL_CONFIG_WHOLE_DISK,
vd->vdev_wholedisk);
+ }
if (vd->vdev_not_present && !(flags & VDEV_CONFIG_MISSING))
fnvlist_add_uint64(nv, ZPOOL_CONFIG_NOT_PRESENT, 1);
Index: sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev_raidz.c
===================================================================
--- sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev_raidz.c
+++ sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev_raidz.c
@@ -28,12 +28,14 @@
#include <sys/zfs_context.h>
#include <sys/spa.h>
+#include <sys/zap.h>
#include <sys/vdev_impl.h>
#ifdef illumos
#include <sys/vdev_disk.h>
#endif
#include <sys/vdev_file.h>
#include <sys/vdev_raidz.h>
+#include <sys/metaslab_impl.h>
#include <sys/zio.h>
#include <sys/zio_checksum.h>
#include <sys/abd.h>
@@ -41,6 +43,12 @@
#include <sys/fm/fs/zfs.h>
#include <sys/bio.h>
+#if 0
+#ifdef ZFS_DEBUG
+#include <sys/vdev_initialize.h> /* vdev_xlate testing */
+#endif
+#endif
+
/*
* Virtual device vector for RAID-Z.
*
@@ -113,27 +121,31 @@
uint64_t rc_offset; /* device offset */
uint64_t rc_size; /* I/O size */
abd_t *rc_abd; /* I/O data */
+ void *rc_orig_data; /* pre-reconstruction */
void *rc_gdata; /* used to store the "good" version */
int rc_error; /* I/O error for this device */
uint8_t rc_tried; /* Did we attempt this I/O column? */
uint8_t rc_skipped; /* Did we skip this I/O column? */
+ uint8_t rc_need_orig_restore; /* need to restore from orig_data? */
} raidz_col_t;
+typedef struct raidz_row {
+ uint64_t rr_cols; /* Regular column count */
+ uint64_t rr_missingdata; /* Count of missing data devices */
+ uint64_t rr_missingparity; /* Count of missing parity devices */
+ uint64_t rr_firstdatacol; /* First data column/parity count */
+ abd_t *rr_abd_copy; /* buffer holding a copy of this row's data columns */
+ int rr_code; /* reconstruction code */
+ raidz_col_t rr_col[0]; /* Flexible array of I/O columns */
+} raidz_row_t;
+
typedef struct raidz_map {
- uint64_t rm_cols; /* Regular column count */
- uint64_t rm_scols; /* Count including skipped columns */
- uint64_t rm_bigcols; /* Number of oversized columns */
- uint64_t rm_asize; /* Actual total I/O size */
- uint64_t rm_missingdata; /* Count of missing data devices */
- uint64_t rm_missingparity; /* Count of missing parity devices */
- uint64_t rm_firstdatacol; /* First data column/parity count */
- uint64_t rm_nskip; /* Skipped sectors for padding */
- uint64_t rm_skipstart; /* Column index of padding start */
- abd_t *rm_abd_copy; /* rm_asize-buffer of copied data */
uintptr_t rm_reports; /* # of referencing checksum reports */
- uint8_t rm_freed; /* map no longer has referencing ZIO */
- uint8_t rm_ecksuminjected; /* checksum error was injected */
- raidz_col_t rm_col[1]; /* Flexible array of I/O columns */
+ boolean_t rm_freed; /* map no longer has referencing ZIO */
+ boolean_t rm_ecksuminjected; /* checksum error was injected */
+ int rm_nrows;
+ int rm_nskip; /* Sectors skipped for padding */
+ raidz_row_t *rm_row[0]; /* flexible array of rows */
} raidz_map_t;
#define VDEV_RAIDZ_P 0
@@ -241,7 +253,7 @@
0x74, 0xd6, 0xf4, 0xea, 0xa8, 0x50, 0x58, 0xaf,
};
-static void vdev_raidz_generate_parity(raidz_map_t *rm);
+static void vdev_raidz_generate_parity(raidz_row_t *);
/*
* Multiply a given number by 2 raised to the given power.
@@ -263,31 +275,46 @@
}
static void
-vdev_raidz_map_free(raidz_map_t *rm)
+vdev_raidz_row_free(raidz_row_t *rr)
{
int c;
- size_t size;
- for (c = 0; c < rm->rm_firstdatacol; c++) {
- if (rm->rm_col[c].rc_abd != NULL)
- abd_free(rm->rm_col[c].rc_abd);
+ for (c = 0; c < rr->rr_firstdatacol && c < rr->rr_cols; c++) {
+ if (rr->rr_col[c].rc_abd != NULL)
+ abd_free(rr->rr_col[c].rc_abd);
- if (rm->rm_col[c].rc_gdata != NULL)
- zio_buf_free(rm->rm_col[c].rc_gdata,
- rm->rm_col[c].rc_size);
+ if (rr->rr_col[c].rc_gdata != NULL) {
+ zio_buf_free(rr->rr_col[c].rc_gdata,
+ rr->rr_col[c].rc_size);
+ }
+ if (rr->rr_col[c].rc_orig_data != NULL) {
+ zio_buf_free(rr->rr_col[c].rc_orig_data,
+ rr->rr_col[c].rc_size);
+ }
}
- size = 0;
- for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) {
- if (rm->rm_col[c].rc_abd != NULL)
- abd_put(rm->rm_col[c].rc_abd);
- size += rm->rm_col[c].rc_size;
+ for (c = rr->rr_firstdatacol; c < rr->rr_cols; c++) {
+ if (rr->rr_col[c].rc_abd != NULL)
+ abd_put(rr->rr_col[c].rc_abd);
+ if (rr->rr_col[c].rc_orig_data != NULL) {
+ zio_buf_free(rr->rr_col[c].rc_orig_data,
+ rr->rr_col[c].rc_size);
+ }
}
- if (rm->rm_abd_copy != NULL)
- abd_free(rm->rm_abd_copy);
+ if (rr->rr_abd_copy != NULL)
+ abd_free(rr->rr_abd_copy);
- kmem_free(rm, offsetof(raidz_map_t, rm_col[rm->rm_scols]));
+ kmem_free(rr, offsetof(raidz_row_t, rr_col[rr->rr_cols]));
+}
+
+static void
+vdev_raidz_map_free(raidz_map_t *rm)
+{
+ for (int i = 0; i < rm->rm_nrows; i++) {
+ vdev_raidz_row_free(rm->rm_row[i]);
+ }
+ kmem_free(rm, offsetof(raidz_map_t, rm_row[rm->rm_nrows]));
}
static void
@@ -296,10 +323,11 @@
raidz_map_t *rm = zio->io_vsd;
ASSERT0(rm->rm_freed);
- rm->rm_freed = 1;
+ rm->rm_freed = B_TRUE;
- if (rm->rm_reports == 0)
+ if (rm->rm_reports == 0) {
vdev_raidz_map_free(rm);
+ }
}
/*ARGSUSED*/
@@ -310,7 +338,7 @@
ASSERT3U(rm->rm_reports, >, 0);
- if (--rm->rm_reports == 0 && rm->rm_freed != 0)
+ if (--rm->rm_reports == 0 && rm->rm_freed)
vdev_raidz_map_free(rm);
}
@@ -324,18 +352,22 @@
const char *good = NULL;
char *bad;
+ zfs_dbgmsg("checksum error on rm=%p", rm);
+
if (good_data == NULL) {
zfs_ereport_finish_checksum(zcr, NULL, NULL, B_FALSE);
return;
}
- if (c < rm->rm_firstdatacol) {
+ zfs_ereport_finish_checksum(zcr, NULL, NULL, B_FALSE);
+#if 0
+ if (c < rm->rr_firstdatacol) {
/*
* The first time through, calculate the parity blocks for
* the good data (this relies on the fact that the good
* data never changes for a given logical ZIO)
*/
- if (rm->rm_col[0].rc_gdata == NULL) {
+ if (rm->rr_col[0].rc_gdata == NULL) {
abd_t *bad_parity[VDEV_RAIDZ_MAXPARITY];
char *buf;
int offset;
@@ -345,22 +377,22 @@
* good_data, first saving the parity bufs and
* replacing them with buffers to hold the result.
*/
- for (x = 0; x < rm->rm_firstdatacol; x++) {
- bad_parity[x] = rm->rm_col[x].rc_abd;
- rm->rm_col[x].rc_gdata =
- zio_buf_alloc(rm->rm_col[x].rc_size);
- rm->rm_col[x].rc_abd =
- abd_get_from_buf(rm->rm_col[x].rc_gdata,
- rm->rm_col[x].rc_size);
+ for (x = 0; x < rm->rr_firstdatacol; x++) {
+ bad_parity[x] = rm->rr_col[x].rc_abd;
+ rm->rr_col[x].rc_gdata =
+ zio_buf_alloc(rm->rr_col[x].rc_size);
+ rm->rr_col[x].rc_abd =
+ abd_get_from_buf(rm->rr_col[x].rc_gdata,
+ rm->rr_col[x].rc_size);
}
/* fill in the data columns from good_data */
buf = (char *)good_data;
- for (; x < rm->rm_cols; x++) {
- abd_put(rm->rm_col[x].rc_abd);
- rm->rm_col[x].rc_abd = abd_get_from_buf(buf,
- rm->rm_col[x].rc_size);
- buf += rm->rm_col[x].rc_size;
+ for (; x < rm->rr_cols; x++) {
+ abd_put(rm->rr_col[x].rc_abd);
+ rm->rr_col[x].rc_abd = abd_get_from_buf(buf,
+ rm->rr_col[x].rc_size);
+ buf += rm->rr_col[x].rc_size;
}
/*
@@ -369,34 +401,35 @@
vdev_raidz_generate_parity(rm);
/* restore everything back to its original state */
- for (x = 0; x < rm->rm_firstdatacol; x++) {
- abd_put(rm->rm_col[x].rc_abd);
- rm->rm_col[x].rc_abd = bad_parity[x];
+ for (x = 0; x < rm->rr_firstdatacol; x++) {
+ abd_put(rm->rr_col[x].rc_abd);
+ rm->rr_col[x].rc_abd = bad_parity[x];
}
offset = 0;
- for (x = rm->rm_firstdatacol; x < rm->rm_cols; x++) {
- abd_put(rm->rm_col[x].rc_abd);
- rm->rm_col[x].rc_abd = abd_get_offset(
- rm->rm_abd_copy, offset);
- offset += rm->rm_col[x].rc_size;
+ for (x = rm->rr_firstdatacol; x < rm->rr_cols; x++) {
+ abd_put(rm->rr_col[x].rc_abd);
+ rm->rr_col[x].rc_abd = abd_get_offset(
+ rm->rr_abd_copy, offset);
+ offset += rm->rr_col[x].rc_size;
}
}
- ASSERT3P(rm->rm_col[c].rc_gdata, !=, NULL);
- good = rm->rm_col[c].rc_gdata;
+ ASSERT3P(rm->rr_col[c].rc_gdata, !=, NULL);
+ good = rm->rr_col[c].rc_gdata;
} else {
/* adjust good_data to point at the start of our column */
good = good_data;
- for (x = rm->rm_firstdatacol; x < c; x++)
- good += rm->rm_col[x].rc_size;
+ for (x = rm->rr_firstdatacol; x < c; x++)
+ good += rm->rr_col[x].rc_size;
}
- bad = abd_borrow_buf_copy(rm->rm_col[c].rc_abd, rm->rm_col[c].rc_size);
+ bad = abd_borrow_buf_copy(rm->rr_col[c].rc_abd, rm->rr_col[c].rc_size);
/* we drop the ereport if it ends up that the data was good */
zfs_ereport_finish_checksum(zcr, good, bad, B_TRUE);
- abd_return_buf(rm->rm_col[c].rc_abd, bad, rm->rm_col[c].rc_size);
+ abd_return_buf(rm->rr_col[c].rc_abd, bad, rm->rr_col[c].rc_size);
+#endif
}
/*
@@ -409,10 +442,7 @@
vdev_raidz_cksum_report(zio_t *zio, zio_cksum_report_t *zcr, void *arg)
{
size_t c = (size_t)(uintptr_t)arg;
- size_t offset;
-
raidz_map_t *rm = zio->io_vsd;
- size_t size;
/* set up the report and bump the refcount */
zcr->zcr_cbdata = rm;
@@ -423,7 +453,7 @@
rm->rm_reports++;
ASSERT3U(rm->rm_reports, >, 0);
- if (rm->rm_abd_copy != NULL)
+ if (rm->rm_row[0]->rr_abd_copy != NULL)
return;
/*
@@ -435,24 +465,33 @@
* to copy them.
*/
- size = 0;
- for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++)
- size += rm->rm_col[c].rc_size;
+ for (int i = 0; i < rm->rm_nrows; i++) {
+ raidz_row_t *rr = rm->rm_row[i];
+ size_t offset;
+ size_t size = 0;
+ for (c = rr->rr_firstdatacol; c < rr->rr_cols; c++)
+ size += rr->rr_col[c].rc_size;
+
+ rr->rr_abd_copy =
+ abd_alloc_sametype(rr->rr_col[rr->rr_firstdatacol].rc_abd,
+ size);
- rm->rm_abd_copy =
- abd_alloc_sametype(rm->rm_col[rm->rm_firstdatacol].rc_abd, size);
+ for (offset = 0, c = rr->rr_firstdatacol; c < rr->rr_cols; c++) {
+ raidz_col_t *col = &rr->rr_col[c];
+
+ if (col->rc_size == 0)
+ continue;
- for (offset = 0, c = rm->rm_firstdatacol; c < rm->rm_cols; c++) {
- raidz_col_t *col = &rm->rm_col[c];
- abd_t *tmp = abd_get_offset(rm->rm_abd_copy, offset);
+ abd_t *tmp = abd_get_offset(rr->rr_abd_copy, offset);
- abd_copy(tmp, col->rc_abd, col->rc_size);
- abd_put(col->rc_abd);
- col->rc_abd = tmp;
+ abd_copy(tmp, col->rc_abd, col->rc_size);
+ abd_put(col->rc_abd);
+ col->rc_abd = tmp;
- offset += col->rc_size;
+ offset += col->rc_size;
+ }
+ ASSERT3U(offset, ==, size);
}
- ASSERT3U(offset, ==, size);
}
static const zio_vsd_ops_t vdev_raidz_vsd_ops = {
@@ -468,7 +507,7 @@
vdev_raidz_map_alloc(abd_t *abd, uint64_t size, uint64_t offset, boolean_t dofree,
uint64_t unit_shift, uint64_t dcols, uint64_t nparity)
{
- raidz_map_t *rm;
+ raidz_row_t *rr;
/* The starting RAIDZ (parent) vdev sector of the block. */
uint64_t b = offset >> unit_shift;
/* The zio's size in units of the vdev's minimum sector size. */
@@ -477,9 +516,13 @@
uint64_t f = b % dcols;
/* The starting byte offset on each child vdev. */
uint64_t o = (b / dcols) << unit_shift;
- uint64_t q, r, c, bc, col, acols, scols, coff, devidx, asize, tot;
+ uint64_t q, r, c, bc, col, acols, coff, devidx, asize, tot;
uint64_t off = 0;
+ raidz_map_t *rm =
+ kmem_zalloc(offsetof(raidz_map_t, rm_row[1]), KM_SLEEP);
+ rm->rm_nrows = 1;
+
/*
* "Quotient": The number of data sectors for this stripe on all but
* the "big column" child vdevs that also contain "remainder" data.
@@ -502,77 +545,63 @@
tot = s + nparity * (q + (r == 0 ? 0 : 1));
/* acols: The columns that will be accessed. */
- /* scols: The columns that will be accessed or skipped. */
if (q == 0) {
/* Our I/O request doesn't span all child vdevs. */
acols = bc;
- scols = MIN(dcols, roundup(bc, nparity + 1));
} else {
acols = dcols;
- scols = dcols;
}
- ASSERT3U(acols, <=, scols);
-
- rm = kmem_alloc(offsetof(raidz_map_t, rm_col[scols]), KM_SLEEP);
+ rr = kmem_alloc(offsetof(raidz_row_t, rr_col[acols]), KM_SLEEP);
+ rm->rm_row[0] = rr;
- rm->rm_cols = acols;
- rm->rm_scols = scols;
- rm->rm_bigcols = bc;
- rm->rm_skipstart = bc;
- rm->rm_missingdata = 0;
- rm->rm_missingparity = 0;
- rm->rm_firstdatacol = nparity;
- rm->rm_abd_copy = NULL;
- rm->rm_reports = 0;
- rm->rm_freed = 0;
- rm->rm_ecksuminjected = 0;
+ rr->rr_cols = acols;
+ rr->rr_missingdata = 0;
+ rr->rr_missingparity = 0;
+ rr->rr_firstdatacol = nparity;
+ rr->rr_abd_copy = NULL;
asize = 0;
- for (c = 0; c < scols; c++) {
+ for (c = 0; c < acols; c++) {
col = f + c;
coff = o;
if (col >= dcols) {
col -= dcols;
coff += 1ULL << unit_shift;
}
- rm->rm_col[c].rc_devidx = col;
- rm->rm_col[c].rc_offset = coff;
- rm->rm_col[c].rc_abd = NULL;
- rm->rm_col[c].rc_gdata = NULL;
- rm->rm_col[c].rc_error = 0;
- rm->rm_col[c].rc_tried = 0;
- rm->rm_col[c].rc_skipped = 0;
-
- if (c >= acols)
- rm->rm_col[c].rc_size = 0;
- else if (c < bc)
- rm->rm_col[c].rc_size = (q + 1) << unit_shift;
+ rr->rr_col[c].rc_devidx = col;
+ rr->rr_col[c].rc_offset = coff;
+ rr->rr_col[c].rc_abd = NULL;
+ rr->rr_col[c].rc_gdata = NULL;
+ rr->rr_col[c].rc_orig_data = NULL;
+ rr->rr_col[c].rc_error = 0;
+ rr->rr_col[c].rc_tried = 0;
+ rr->rr_col[c].rc_skipped = 0;
+ rr->rr_col[c].rc_need_orig_restore = B_FALSE;
+
+ if (c < bc)
+ rr->rr_col[c].rc_size = (q + 1) << unit_shift;
else
- rm->rm_col[c].rc_size = q << unit_shift;
+ rr->rr_col[c].rc_size = q << unit_shift;
- asize += rm->rm_col[c].rc_size;
+ asize += rr->rr_col[c].rc_size;
}
ASSERT3U(asize, ==, tot << unit_shift);
- rm->rm_asize = roundup(asize, (nparity + 1) << unit_shift);
rm->rm_nskip = roundup(tot, nparity + 1) - tot;
- ASSERT3U(rm->rm_asize - asize, ==, rm->rm_nskip << unit_shift);
- ASSERT3U(rm->rm_nskip, <=, nparity);
if (!dofree) {
- for (c = 0; c < rm->rm_firstdatacol; c++) {
- rm->rm_col[c].rc_abd =
- abd_alloc_linear(rm->rm_col[c].rc_size, B_TRUE);
- }
+ for (c = 0; c < rr->rr_firstdatacol; c++)
+ rr->rr_col[c].rc_abd =
+ abd_alloc_linear(rr->rr_col[c].rc_size, B_TRUE);
- rm->rm_col[c].rc_abd = abd_get_offset(abd, 0);
- off = rm->rm_col[c].rc_size;
+ rr->rr_col[c].rc_abd = abd_get_offset(abd, 0);
+ off = rr->rr_col[c].rc_size;
for (c = c + 1; c < acols; c++) {
- rm->rm_col[c].rc_abd = abd_get_offset(abd, off);
- off += rm->rm_col[c].rc_size;
+ rr->rr_col[c].rc_abd = abd_get_offset(abd, off);
+ off += rr->rr_col[c].rc_size;
}
}
@@ -596,20 +625,182 @@
* skip the first column since at least one data and one parity
* column must appear in each row.
*/
- ASSERT(rm->rm_cols >= 2);
- ASSERT(rm->rm_col[0].rc_size == rm->rm_col[1].rc_size);
+ ASSERT(rr->rr_cols >= 2);
+ ASSERT(rr->rr_col[0].rc_size == rr->rr_col[1].rc_size);
+
+ if (rr->rr_firstdatacol == 1 && (offset & (1ULL << 20))) {
+ devidx = rr->rr_col[0].rc_devidx;
+ o = rr->rr_col[0].rc_offset;
+ rr->rr_col[0].rc_devidx = rr->rr_col[1].rc_devidx;
+ rr->rr_col[0].rc_offset = rr->rr_col[1].rc_offset;
+ rr->rr_col[1].rc_devidx = devidx;
+ rr->rr_col[1].rc_offset = o;
+ }
+
+ return (rm);
+}
+
+static raidz_map_t *
+vdev_raidz_map_alloc_expanded(abd_t *abd, uint64_t size, uint64_t offset, boolean_t dofree,
+ uint64_t ashift, uint64_t physical_cols, uint64_t logical_cols,
+ uint64_t nparity)
+{
+ /* The starting RAIDZ (parent) vdev sector of the block. */
+ uint64_t b = offset >> ashift;
+ /* The zio's size in units of the vdev's minimum sector size. */
+ uint64_t s = size >> ashift;
+ uint64_t cur_col = b % physical_cols;
+ /* The starting byte offset on each child vdev. */
+ uint64_t child_offset = (b / physical_cols) << ashift;
+ uint64_t q, r, bc, devidx, asize, tot;
+
+ /*
+ * "Quotient": The number of data sectors for this stripe on all but
+ * the "big column" child vdevs that also contain "remainder" data.
+ * AKA "full rows"
+ */
+ q = s / (logical_cols - nparity);
+
+ /*
+ * "Remainder": The number of partial stripe data sectors in this I/O.
+ * This will add a sector to some, but not all, child vdevs.
+ */
+ r = s - q * (logical_cols - nparity);
+
+ /* The number of "big columns" - those which contain remainder data. */
+ bc = (r == 0 ? 0 : r + nparity);
+
+ /*
+ * The total number of data and parity sectors associated with
+ * this I/O.
+ */
+ tot = s + nparity * (q + (r == 0 ? 0 : 1));
+
+ /* How many rows contain data (not skip) */
+ uint64_t rows = howmany(tot, logical_cols);
+ int cols = MIN(tot, logical_cols);
+
+ raidz_map_t *rm =
+ kmem_zalloc(offsetof(raidz_map_t, rm_row[rows]),
+ KM_SLEEP);
+ rm->rm_nrows = rows;
+ rm->rm_nskip = roundup(tot, nparity + 1) - tot;
+ asize = 0;
- if (rm->rm_firstdatacol == 1 && (offset & (1ULL << 20))) {
- devidx = rm->rm_col[0].rc_devidx;
- o = rm->rm_col[0].rc_offset;
- rm->rm_col[0].rc_devidx = rm->rm_col[1].rc_devidx;
- rm->rm_col[0].rc_offset = rm->rm_col[1].rc_offset;
- rm->rm_col[1].rc_devidx = devidx;
- rm->rm_col[1].rc_offset = o;
+ zfs_dbgmsg("rm=%p s=%d q=%d r=%d bc=%d nrows=%d cols=%d",
+ rm, (int)s, (int)q, (int)r, (int)bc, (int)rows, (int)cols);
+
+ for (uint64_t row = 0; row < rows; row++) {
+ raidz_row_t *rr = kmem_alloc(offsetof(raidz_row_t,
+ rr_col[cols]), KM_SLEEP);
+ rm->rm_row[row] = rr;
+
+ /*
+ * We set cols to the entire width of the block, even
+ * if this row is shorter. This is needed because parity
+ * generation (for Q and R) needs to know the entire width,
+ * because it treats the short row as though it was
+ * full-width (and the "phantom" sectors were zero-filled).
+ *
+ * Another approach to this would be to set cols shorter
+ * (to just the number of columns that we might do i/o to)
+ * and have another mechanism to tell the parity generation
+ * about the "entire width". Reconstruction (at least
+ * vdev_raidz_reconstruct_general()) would also need to
+ * know about the "entire width".
+ */
+ rr->rr_cols = cols;
+ rr->rr_missingdata = 0;
+ rr->rr_missingparity = 0;
+ rr->rr_firstdatacol = nparity;
+ rr->rr_abd_copy = NULL;
+
+ for (int c = 0; c < rr->rr_cols; c++, cur_col++) {
+ if (cur_col >= physical_cols) {
+ cur_col -= physical_cols;
+ child_offset += 1ULL << ashift;
+ }
+ rr->rr_col[c].rc_devidx = cur_col;
+ rr->rr_col[c].rc_offset = child_offset;
+ rr->rr_col[c].rc_gdata = NULL;
+ rr->rr_col[c].rc_orig_data = NULL;
+ rr->rr_col[c].rc_error = 0;
+ rr->rr_col[c].rc_tried = 0;
+ rr->rr_col[c].rc_skipped = 0;
+ rr->rr_col[c].rc_abd = NULL;
+ rr->rr_col[c].rc_need_orig_restore = B_FALSE;
+
+ uint64_t dc = c - rr->rr_firstdatacol;
+ if (c < rr->rr_firstdatacol) {
+ rr->rr_col[c].rc_size = 1ULL << ashift;
+ if (!dofree) {
+ rr->rr_col[c].rc_abd =
+ abd_alloc_linear(rr->rr_col[c].rc_size,
+ B_TRUE);
+ }
+ } else if (row == rows - 1 && bc != 0 && c >= bc) {
+ /*
+ * Past the end of the data: zero-size column kept for parity generation.
+ */
+ rr->rr_col[c].rc_size = 0;
+ rr->rr_col[c].rc_abd = NULL;
+ } else {
+ /* XXX ASCII art diagram here */
+ /* "data column" (col excluding parity) */
+ uint64_t off;
+
+ if (c < bc || r == 0) {
+ off = dc * rows + row;
+ } else {
+ off = r * rows +
+ (dc - r) * (rows - 1) + row;
+ }
+ zfs_dbgmsg("rm=%p row=%d c=%d dc=%d off=%u devidx=%u",
+ rm, (int)row, (int)c, (int)dc, (int)off, (int)cur_col);
+ rr->rr_col[c].rc_size = 1ULL << ashift;
+ if (!dofree) {
+ rr->rr_col[c].rc_abd =
+ abd_get_offset(abd, off << ashift);
+ }
+ }
+
+ asize += rr->rr_col[c].rc_size;
+ }
+
+ /*
+ * If all data stored spans all columns, there's a danger that parity
+ * will always be on the same device and, since parity isn't read
+ * during normal operation, that that device's I/O bandwidth won't be
+ * used effectively. We therefore switch the parity every 1MB.
+ *
+ * ... at least that was, ostensibly, the theory. As a practical
+ * matter unless we juggle the parity between all devices evenly, we
+ * won't see any benefit. Further, occasional writes that aren't a
+ * multiple of the LCM of the number of children and the minimum
+ * stripe width are sufficient to avoid pessimal behavior.
+ * Unfortunately, this decision created an implicit on-disk format
+ * requirement that we need to support for all eternity, but only
+ * for single-parity RAID-Z.
+ *
+ * If we intend to skip a sector in the zeroth column for padding
+ * we must make sure to note this swap. We will never intend to
+ * skip the first column since at least one data and one parity
+ * column must appear in each row.
+ */
+ if (rr->rr_firstdatacol == 1 && rr->rr_cols > 1 &&
+ (offset & (1ULL << 20))) {
+ ASSERT(rr->rr_cols >= 2);
+ ASSERT(rr->rr_col[0].rc_size == rr->rr_col[1].rc_size);
+ devidx = rr->rr_col[0].rc_devidx;
+ uint64_t o = rr->rr_col[0].rc_offset;
+ rr->rr_col[0].rc_devidx = rr->rr_col[1].rc_devidx;
+ rr->rr_col[0].rc_offset = rr->rr_col[1].rc_offset;
+ rr->rr_col[1].rc_devidx = devidx;
+ rr->rr_col[1].rc_offset = o;
+ }
- if (rm->rm_skipstart == 0)
- rm->rm_skipstart = 1;
}
+ ASSERT3U(asize, ==, tot << ashift);
return (rm);
}
@@ -676,55 +867,48 @@
}
static void
-vdev_raidz_generate_parity_p(raidz_map_t *rm)
+vdev_raidz_generate_parity_p(raidz_row_t *rr)
{
- uint64_t *p;
- int c;
- abd_t *src;
+ uint64_t *p = abd_to_buf(rr->rr_col[VDEV_RAIDZ_P].rc_abd);
- for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) {
- src = rm->rm_col[c].rc_abd;
- p = abd_to_buf(rm->rm_col[VDEV_RAIDZ_P].rc_abd);
+ for (int c = rr->rr_firstdatacol; c < rr->rr_cols; c++) {
+ abd_t *src = rr->rr_col[c].rc_abd;
- if (c == rm->rm_firstdatacol) {
- abd_copy_to_buf(p, src, rm->rm_col[c].rc_size);
+ if (c == rr->rr_firstdatacol) {
+ abd_copy_to_buf(p, src, rr->rr_col[c].rc_size);
} else {
struct pqr_struct pqr = { p, NULL, NULL };
- (void) abd_iterate_func(src, 0, rm->rm_col[c].rc_size,
+ (void) abd_iterate_func(src, 0, rr->rr_col[c].rc_size,
vdev_raidz_p_func, &pqr);
}
}
}
static void
-vdev_raidz_generate_parity_pq(raidz_map_t *rm)
+vdev_raidz_generate_parity_pq(raidz_row_t *rr)
{
- uint64_t *p, *q, pcnt, ccnt, mask, i;
- int c;
- abd_t *src;
+ uint64_t *p = abd_to_buf(rr->rr_col[VDEV_RAIDZ_P].rc_abd);
+ uint64_t *q = abd_to_buf(rr->rr_col[VDEV_RAIDZ_Q].rc_abd);
+ uint64_t pcnt = rr->rr_col[VDEV_RAIDZ_P].rc_size / sizeof (p[0]);
+ ASSERT(rr->rr_col[VDEV_RAIDZ_P].rc_size ==
+ rr->rr_col[VDEV_RAIDZ_Q].rc_size);
- pcnt = rm->rm_col[VDEV_RAIDZ_P].rc_size / sizeof (p[0]);
- ASSERT(rm->rm_col[VDEV_RAIDZ_P].rc_size ==
- rm->rm_col[VDEV_RAIDZ_Q].rc_size);
+ for (int c = rr->rr_firstdatacol; c < rr->rr_cols; c++) {
+ abd_t *src = rr->rr_col[c].rc_abd;
- for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) {
- src = rm->rm_col[c].rc_abd;
- p = abd_to_buf(rm->rm_col[VDEV_RAIDZ_P].rc_abd);
- q = abd_to_buf(rm->rm_col[VDEV_RAIDZ_Q].rc_abd);
+ uint64_t ccnt = rr->rr_col[c].rc_size / sizeof (p[0]);
- ccnt = rm->rm_col[c].rc_size / sizeof (p[0]);
-
- if (c == rm->rm_firstdatacol) {
- abd_copy_to_buf(p, src, rm->rm_col[c].rc_size);
- (void) memcpy(q, p, rm->rm_col[c].rc_size);
+ if (c == rr->rr_firstdatacol) {
+ abd_copy_to_buf(p, src, rr->rr_col[c].rc_size);
+ (void) memcpy(q, p, rr->rr_col[c].rc_size);
} else {
struct pqr_struct pqr = { p, q, NULL };
- (void) abd_iterate_func(src, 0, rm->rm_col[c].rc_size,
+ (void) abd_iterate_func(src, 0, rr->rr_col[c].rc_size,
vdev_raidz_pq_func, &pqr);
}
- if (c == rm->rm_firstdatacol) {
- for (i = ccnt; i < pcnt; i++) {
+ if (c == rr->rr_firstdatacol) {
+ for (uint64_t i = ccnt; i < pcnt; i++) {
p[i] = 0;
q[i] = 0;
}
@@ -733,7 +917,8 @@
* Treat short columns as though they are full of 0s.
* Note that there's therefore nothing needed for P.
*/
- for (i = ccnt; i < pcnt; i++) {
+ uint64_t mask;
+ for (uint64_t i = ccnt; i < pcnt; i++) {
VDEV_RAIDZ_64MUL_2(q[i], mask);
}
}
@@ -741,38 +926,35 @@
}
static void
-vdev_raidz_generate_parity_pqr(raidz_map_t *rm)
+vdev_raidz_generate_parity_pqr(raidz_row_t *rr)
{
- uint64_t *p, *q, *r, pcnt, ccnt, mask, i;
- int c;
- abd_t *src;
-
- pcnt = rm->rm_col[VDEV_RAIDZ_P].rc_size / sizeof (p[0]);
- ASSERT(rm->rm_col[VDEV_RAIDZ_P].rc_size ==
- rm->rm_col[VDEV_RAIDZ_Q].rc_size);
- ASSERT(rm->rm_col[VDEV_RAIDZ_P].rc_size ==
- rm->rm_col[VDEV_RAIDZ_R].rc_size);
-
- for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) {
- src = rm->rm_col[c].rc_abd;
- p = abd_to_buf(rm->rm_col[VDEV_RAIDZ_P].rc_abd);
- q = abd_to_buf(rm->rm_col[VDEV_RAIDZ_Q].rc_abd);
- r = abd_to_buf(rm->rm_col[VDEV_RAIDZ_R].rc_abd);
-
- ccnt = rm->rm_col[c].rc_size / sizeof (p[0]);
-
- if (c == rm->rm_firstdatacol) {
- abd_copy_to_buf(p, src, rm->rm_col[c].rc_size);
- (void) memcpy(q, p, rm->rm_col[c].rc_size);
- (void) memcpy(r, p, rm->rm_col[c].rc_size);
+ uint64_t *p = abd_to_buf(rr->rr_col[VDEV_RAIDZ_P].rc_abd);
+ uint64_t *q = abd_to_buf(rr->rr_col[VDEV_RAIDZ_Q].rc_abd);
+ uint64_t *r = abd_to_buf(rr->rr_col[VDEV_RAIDZ_R].rc_abd);
+ uint64_t pcnt = rr->rr_col[VDEV_RAIDZ_P].rc_size / sizeof (p[0]);
+ ASSERT(rr->rr_col[VDEV_RAIDZ_P].rc_size ==
+ rr->rr_col[VDEV_RAIDZ_Q].rc_size);
+ ASSERT(rr->rr_col[VDEV_RAIDZ_P].rc_size ==
+ rr->rr_col[VDEV_RAIDZ_R].rc_size);
+
+ for (int c = rr->rr_firstdatacol; c < rr->rr_cols; c++) {
+ abd_t *src = rr->rr_col[c].rc_abd;
+
+ uint64_t ccnt = rr->rr_col[c].rc_size / sizeof (p[0]);
+
+ if (c == rr->rr_firstdatacol) {
+ abd_copy_to_buf(p, src, rr->rr_col[c].rc_size);
+ (void) memcpy(q, p, rr->rr_col[c].rc_size);
+ (void) memcpy(r, p, rr->rr_col[c].rc_size);
} else {
struct pqr_struct pqr = { p, q, r };
- (void) abd_iterate_func(src, 0, rm->rm_col[c].rc_size,
+ (void) abd_iterate_func(src, 0, rr->rr_col[c].rc_size,
vdev_raidz_pqr_func, &pqr);
}
- if (c == rm->rm_firstdatacol) {
- for (i = ccnt; i < pcnt; i++) {
+ if (c == rr->rr_firstdatacol) {
+ for (uint64_t i = ccnt; i < pcnt; i++) {
+ /* XXX does this really happen? The first data column should be the same size as the parity columns. */
p[i] = 0;
q[i] = 0;
r[i] = 0;
@@ -782,7 +964,8 @@
* Treat short columns as though they are full of 0s.
* Note that there's therefore nothing needed for P.
*/
- for (i = ccnt; i < pcnt; i++) {
+ uint64_t mask;
+ for (uint64_t i = ccnt; i < pcnt; i++) {
VDEV_RAIDZ_64MUL_2(q[i], mask);
VDEV_RAIDZ_64MUL_4(r[i], mask);
}
@@ -795,17 +978,27 @@
* parity columns available.
*/
static void
-vdev_raidz_generate_parity(raidz_map_t *rm)
+vdev_raidz_generate_parity(raidz_row_t *rr)
{
- switch (rm->rm_firstdatacol) {
+ if (rr->rr_cols == 0) {
+ /*
+ * We are handling this block one row at a time (because
+ * this block has a different logical vs physical width,
+ * due to RAIDZ expansion), and this is a pad-only row,
+ * which has no parity.
+ */
+ return;
+ }
+
+ switch (rr->rr_firstdatacol) {
case 1:
- vdev_raidz_generate_parity_p(rm);
+ vdev_raidz_generate_parity_p(rr);
break;
case 2:
- vdev_raidz_generate_parity_pq(rm);
+ vdev_raidz_generate_parity_pq(rr);
break;
case 3:
- vdev_raidz_generate_parity_pqr(rm);
+ vdev_raidz_generate_parity_pqr(rr);
break;
default:
cmn_err(CE_PANIC, "invalid RAID-Z configuration");
@@ -929,30 +1122,31 @@
}
static int
-vdev_raidz_reconstruct_p(raidz_map_t *rm, int *tgts, int ntgts)
+vdev_raidz_reconstruct_p(raidz_row_t *rr, int *tgts, int ntgts)
{
int x = tgts[0];
- int c;
abd_t *dst, *src;
- ASSERT(ntgts == 1);
- ASSERT(x >= rm->rm_firstdatacol);
- ASSERT(x < rm->rm_cols);
+ zfs_dbgmsg("reconstruct_p(rm=%p x=%u)",
+ rr, x);
+
+ ASSERT3U(ntgts, ==, 1);
+ ASSERT3U(x, >=, rr->rr_firstdatacol);
+ ASSERT3U(x, <, rr->rr_cols);
- ASSERT(rm->rm_col[x].rc_size <= rm->rm_col[VDEV_RAIDZ_P].rc_size);
- ASSERT(rm->rm_col[x].rc_size > 0);
+ ASSERT3U(rr->rr_col[x].rc_size, <=, rr->rr_col[VDEV_RAIDZ_P].rc_size);
- src = rm->rm_col[VDEV_RAIDZ_P].rc_abd;
- dst = rm->rm_col[x].rc_abd;
+ src = rr->rr_col[VDEV_RAIDZ_P].rc_abd;
+ dst = rr->rr_col[x].rc_abd;
- abd_copy(dst, src, rm->rm_col[x].rc_size);
+ abd_copy(dst, src, rr->rr_col[x].rc_size);
- for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) {
- uint64_t size = MIN(rm->rm_col[x].rc_size,
- rm->rm_col[c].rc_size);
+ for (int c = rr->rr_firstdatacol; c < rr->rr_cols; c++) {
+ uint64_t size = MIN(rr->rr_col[x].rc_size,
+ rr->rr_col[c].rc_size);
- src = rm->rm_col[c].rc_abd;
- dst = rm->rm_col[x].rc_abd;
+ src = rr->rr_col[c].rc_abd;
+ dst = rr->rr_col[x].rc_abd; /* XXX not needed, done above */
if (c == x)
continue;
@@ -965,51 +1159,54 @@
}
static int
-vdev_raidz_reconstruct_q(raidz_map_t *rm, int *tgts, int ntgts)
+vdev_raidz_reconstruct_q(raidz_row_t *rr, int *tgts, int ntgts)
{
int x = tgts[0];
int c, exp;
abd_t *dst, *src;
+ zfs_dbgmsg("reconstruct_q(rm=%p x=%u)",
+ rr, x);
+
ASSERT(ntgts == 1);
- ASSERT(rm->rm_col[x].rc_size <= rm->rm_col[VDEV_RAIDZ_Q].rc_size);
+ ASSERT(rr->rr_col[x].rc_size <= rr->rr_col[VDEV_RAIDZ_Q].rc_size);
- for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) {
- uint64_t size = (c == x) ? 0 : MIN(rm->rm_col[x].rc_size,
- rm->rm_col[c].rc_size);
+ for (c = rr->rr_firstdatacol; c < rr->rr_cols; c++) {
+ uint64_t size = (c == x) ? 0 : MIN(rr->rr_col[x].rc_size,
+ rr->rr_col[c].rc_size);
- src = rm->rm_col[c].rc_abd;
- dst = rm->rm_col[x].rc_abd;
+ src = rr->rr_col[c].rc_abd;
+ dst = rr->rr_col[x].rc_abd;
- if (c == rm->rm_firstdatacol) {
+ if (c == rr->rr_firstdatacol) {
abd_copy(dst, src, size);
- if (rm->rm_col[x].rc_size > size)
+ if (rr->rr_col[x].rc_size > size)
abd_zero_off(dst, size,
- rm->rm_col[x].rc_size - size);
+ rr->rr_col[x].rc_size - size);
} else {
- ASSERT3U(size, <=, rm->rm_col[x].rc_size);
+ ASSERT3U(size, <=, rr->rr_col[x].rc_size);
(void) abd_iterate_func2(dst, src, 0, 0, size,
vdev_raidz_reconst_q_pre_func, NULL);
(void) abd_iterate_func(dst,
- size, rm->rm_col[x].rc_size - size,
+ size, rr->rr_col[x].rc_size - size,
vdev_raidz_reconst_q_pre_tail_func, NULL);
}
}
- src = rm->rm_col[VDEV_RAIDZ_Q].rc_abd;
- dst = rm->rm_col[x].rc_abd;
- exp = 255 - (rm->rm_cols - 1 - x);
+ src = rr->rr_col[VDEV_RAIDZ_Q].rc_abd;
+ dst = rr->rr_col[x].rc_abd;
+ exp = 255 - (rr->rr_cols - 1 - x);
struct reconst_q_struct rq = { abd_to_buf(src), exp };
- (void) abd_iterate_func(dst, 0, rm->rm_col[x].rc_size,
+ (void) abd_iterate_func(dst, 0, rr->rr_col[x].rc_size,
vdev_raidz_reconst_q_post_func, &rq);
return (1 << VDEV_RAIDZ_Q);
}
static int
-vdev_raidz_reconstruct_pq(raidz_map_t *rm, int *tgts, int ntgts)
+vdev_raidz_reconstruct_pq(raidz_row_t *rr, int *tgts, int ntgts)
{
uint8_t *p, *q, *pxy, *qxy, tmp, a, b, aexp, bexp;
abd_t *pdata, *qdata;
@@ -1018,12 +1215,15 @@
int y = tgts[1];
abd_t *xd, *yd;
+ zfs_dbgmsg("reconstruct_pq(rm=%p x=%u y=%u)",
+ rr, x, y);
+
ASSERT(ntgts == 2);
ASSERT(x < y);
- ASSERT(x >= rm->rm_firstdatacol);
- ASSERT(y < rm->rm_cols);
+ ASSERT(x >= rr->rr_firstdatacol);
+ ASSERT(y < rr->rr_cols);
- ASSERT(rm->rm_col[x].rc_size >= rm->rm_col[y].rc_size);
+ ASSERT(rr->rr_col[x].rc_size >= rr->rr_col[y].rc_size);
/*
* Move the parity data aside -- we're going to compute parity as
@@ -1032,29 +1232,29 @@
* parity so we make those columns appear to be full of zeros by
* setting their lengths to zero.
*/
- pdata = rm->rm_col[VDEV_RAIDZ_P].rc_abd;
- qdata = rm->rm_col[VDEV_RAIDZ_Q].rc_abd;
- xsize = rm->rm_col[x].rc_size;
- ysize = rm->rm_col[y].rc_size;
+ pdata = rr->rr_col[VDEV_RAIDZ_P].rc_abd;
+ qdata = rr->rr_col[VDEV_RAIDZ_Q].rc_abd;
+ xsize = rr->rr_col[x].rc_size;
+ ysize = rr->rr_col[y].rc_size;
- rm->rm_col[VDEV_RAIDZ_P].rc_abd =
- abd_alloc_linear(rm->rm_col[VDEV_RAIDZ_P].rc_size, B_TRUE);
- rm->rm_col[VDEV_RAIDZ_Q].rc_abd =
- abd_alloc_linear(rm->rm_col[VDEV_RAIDZ_Q].rc_size, B_TRUE);
- rm->rm_col[x].rc_size = 0;
- rm->rm_col[y].rc_size = 0;
+ rr->rr_col[VDEV_RAIDZ_P].rc_abd =
+ abd_alloc_linear(rr->rr_col[VDEV_RAIDZ_P].rc_size, B_TRUE);
+ rr->rr_col[VDEV_RAIDZ_Q].rc_abd =
+ abd_alloc_linear(rr->rr_col[VDEV_RAIDZ_Q].rc_size, B_TRUE);
+ rr->rr_col[x].rc_size = 0;
+ rr->rr_col[y].rc_size = 0;
- vdev_raidz_generate_parity_pq(rm);
+ vdev_raidz_generate_parity_pq(rr);
- rm->rm_col[x].rc_size = xsize;
- rm->rm_col[y].rc_size = ysize;
+ rr->rr_col[x].rc_size = xsize;
+ rr->rr_col[y].rc_size = ysize;
p = abd_to_buf(pdata);
q = abd_to_buf(qdata);
- pxy = abd_to_buf(rm->rm_col[VDEV_RAIDZ_P].rc_abd);
- qxy = abd_to_buf(rm->rm_col[VDEV_RAIDZ_Q].rc_abd);
- xd = rm->rm_col[x].rc_abd;
- yd = rm->rm_col[y].rc_abd;
+ pxy = abd_to_buf(rr->rr_col[VDEV_RAIDZ_P].rc_abd);
+ qxy = abd_to_buf(rr->rr_col[VDEV_RAIDZ_Q].rc_abd);
+ xd = rr->rr_col[x].rc_abd;
+ yd = rr->rr_col[y].rc_abd;
/*
* We now have:
@@ -1072,7 +1272,7 @@
*/
a = vdev_raidz_pow2[255 + x - y];
- b = vdev_raidz_pow2[255 - (rm->rm_cols - 1 - x)];
+ b = vdev_raidz_pow2[255 - (rr->rr_cols - 1 - x)];
tmp = 255 - vdev_raidz_log2[a ^ 1];
aexp = vdev_raidz_log2[vdev_raidz_exp2(a, tmp)];
@@ -1085,14 +1285,14 @@
(void) abd_iterate_func(xd, ysize, xsize - ysize,
vdev_raidz_reconst_pq_tail_func, &rpq);
- abd_free(rm->rm_col[VDEV_RAIDZ_P].rc_abd);
- abd_free(rm->rm_col[VDEV_RAIDZ_Q].rc_abd);
+ abd_free(rr->rr_col[VDEV_RAIDZ_P].rc_abd);
+ abd_free(rr->rr_col[VDEV_RAIDZ_Q].rc_abd);
/*
* Restore the saved parity data.
*/
- rm->rm_col[VDEV_RAIDZ_P].rc_abd = pdata;
- rm->rm_col[VDEV_RAIDZ_Q].rc_abd = qdata;
+ rr->rr_col[VDEV_RAIDZ_P].rc_abd = pdata;
+ rr->rr_col[VDEV_RAIDZ_Q].rc_abd = qdata;
return ((1 << VDEV_RAIDZ_P) | (1 << VDEV_RAIDZ_Q));
}
@@ -1249,13 +1449,13 @@
/* END CSTYLED */
static void
-vdev_raidz_matrix_init(raidz_map_t *rm, int n, int nmap, int *map,
+vdev_raidz_matrix_init(raidz_row_t *rr, int n, int nmap, int *map,
uint8_t **rows)
{
int i, j;
int pow;
- ASSERT(n == rm->rm_cols - rm->rm_firstdatacol);
+ ASSERT(n == rr->rr_cols - rr->rr_firstdatacol);
/*
* Fill in the missing rows of interest.
@@ -1279,7 +1479,7 @@
}
static void
-vdev_raidz_matrix_invert(raidz_map_t *rm, int n, int nmissing, int *missing,
+vdev_raidz_matrix_invert(raidz_row_t *rr, int n, int nmissing, int *missing,
uint8_t **rows, uint8_t **invrows, const uint8_t *used)
{
int i, j, ii, jj;
@@ -1291,10 +1491,10 @@
* correspond to data columns.
*/
for (i = 0; i < nmissing; i++) {
- ASSERT3S(used[i], <, rm->rm_firstdatacol);
+ ASSERT3S(used[i], <, rr->rr_firstdatacol);
}
for (; i < n; i++) {
- ASSERT3S(used[i], >=, rm->rm_firstdatacol);
+ ASSERT3S(used[i], >=, rr->rr_firstdatacol);
}
/*
@@ -1311,8 +1511,8 @@
*/
for (i = 0; i < nmissing; i++) {
for (j = nmissing; j < n; j++) {
- ASSERT3U(used[j], >=, rm->rm_firstdatacol);
- jj = used[j] - rm->rm_firstdatacol;
+ ASSERT3U(used[j], >=, rr->rr_firstdatacol);
+ jj = used[j] - rr->rr_firstdatacol;
ASSERT3S(jj, <, n);
invrows[i][j] = rows[i][jj];
rows[i][jj] = 0;
@@ -1373,7 +1573,7 @@
}
static void
-vdev_raidz_matrix_reconstruct(raidz_map_t *rm, int n, int nmissing,
+vdev_raidz_matrix_reconstruct(raidz_row_t *rr, int n, int nmissing,
int *missing, uint8_t **invrows, const uint8_t *used)
{
int i, j, x, cc, c;
@@ -1405,22 +1605,24 @@
for (i = 0; i < n; i++) {
c = used[i];
- ASSERT3U(c, <, rm->rm_cols);
+ ASSERT3U(c, <, rr->rr_cols);
- src = abd_to_buf(rm->rm_col[c].rc_abd);
- ccount = rm->rm_col[c].rc_size;
+ ccount = rr->rr_col[c].rc_size;
+ ASSERT(ccount >= rr->rr_col[missing[0]].rc_size || i > 0);
+ if (ccount == 0)
+ continue;
+ src = abd_to_buf(rr->rr_col[c].rc_abd);
for (j = 0; j < nmissing; j++) {
- cc = missing[j] + rm->rm_firstdatacol;
- ASSERT3U(cc, >=, rm->rm_firstdatacol);
- ASSERT3U(cc, <, rm->rm_cols);
+ cc = missing[j] + rr->rr_firstdatacol;
+ ASSERT3U(cc, >=, rr->rr_firstdatacol);
+ ASSERT3U(cc, <, rr->rr_cols);
ASSERT3U(cc, !=, c);
- dst[j] = abd_to_buf(rm->rm_col[cc].rc_abd);
- dcount[j] = rm->rm_col[cc].rc_size;
+ dcount[j] = rr->rr_col[cc].rc_size;
+ if (dcount[j] != 0)
+ dst[j] = abd_to_buf(rr->rr_col[cc].rc_abd);
}
- ASSERT(ccount >= rm->rm_col[missing[0]].rc_size || i > 0);
-
for (x = 0; x < ccount; x++, src++) {
if (*src != 0)
log = vdev_raidz_log2[*src];
@@ -1449,13 +1651,15 @@
}
static int
-vdev_raidz_reconstruct_general(raidz_map_t *rm, int *tgts, int ntgts)
+vdev_raidz_reconstruct_general(raidz_row_t *rr, int *tgts, int ntgts)
{
int n, i, c, t, tt;
int nmissing_rows;
int missing_rows[VDEV_RAIDZ_MAXPARITY];
int parity_map[VDEV_RAIDZ_MAXPARITY];
+ zfs_dbgmsg("reconstruct_general(rm=%p ntgts=%u)",
+ rr, ntgts);
uint8_t *p, *pp;
size_t psize;
@@ -1471,28 +1675,31 @@
* Matrix reconstruction can't use scatter ABDs yet, so we allocate
* temporary linear ABDs.
*/
- if (!abd_is_linear(rm->rm_col[rm->rm_firstdatacol].rc_abd)) {
- bufs = kmem_alloc(rm->rm_cols * sizeof (abd_t *), KM_PUSHPAGE);
+ if (!abd_is_linear(rr->rr_col[rr->rr_firstdatacol].rc_abd)) {
+ bufs = kmem_alloc(rr->rr_cols * sizeof (abd_t *), KM_PUSHPAGE);
- for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) {
- raidz_col_t *col = &rm->rm_col[c];
+ for (c = rr->rr_firstdatacol; c < rr->rr_cols; c++) {
+ raidz_col_t *col = &rr->rr_col[c];
bufs[c] = col->rc_abd;
- col->rc_abd = abd_alloc_linear(col->rc_size, B_TRUE);
- abd_copy(col->rc_abd, bufs[c], col->rc_size);
+ if (bufs[c] != NULL) {
+ col->rc_abd =
+ abd_alloc_linear(col->rc_size, B_TRUE);
+ abd_copy(col->rc_abd, bufs[c], col->rc_size);
+ }
}
}
- n = rm->rm_cols - rm->rm_firstdatacol;
+ n = rr->rr_cols - rr->rr_firstdatacol;
/*
* Figure out which data columns are missing.
*/
nmissing_rows = 0;
for (t = 0; t < ntgts; t++) {
- if (tgts[t] >= rm->rm_firstdatacol) {
+ if (tgts[t] >= rr->rr_firstdatacol) {
missing_rows[nmissing_rows++] =
- tgts[t] - rm->rm_firstdatacol;
+ tgts[t] - rr->rr_firstdatacol;
}
}
@@ -1502,7 +1709,7 @@
*/
for (tt = 0, c = 0, i = 0; i < nmissing_rows; c++) {
ASSERT(tt < ntgts);
- ASSERT(c < rm->rm_firstdatacol);
+ ASSERT(c < rr->rr_firstdatacol);
/*
* Skip any targeted parity columns.
@@ -1537,9 +1744,9 @@
used[i] = parity_map[i];
}
- for (tt = 0, c = rm->rm_firstdatacol; c < rm->rm_cols; c++) {
+ for (tt = 0, c = rr->rr_firstdatacol; c < rr->rr_cols; c++) {
if (tt < nmissing_rows &&
- c == missing_rows[tt] + rm->rm_firstdatacol) {
+ c == missing_rows[tt] + rr->rr_firstdatacol) {
tt++;
continue;
}
@@ -1552,18 +1759,18 @@
/*
* Initialize the interesting rows of the matrix.
*/
- vdev_raidz_matrix_init(rm, n, nmissing_rows, parity_map, rows);
+ vdev_raidz_matrix_init(rr, n, nmissing_rows, parity_map, rows);
/*
* Invert the matrix.
*/
- vdev_raidz_matrix_invert(rm, n, nmissing_rows, missing_rows, rows,
+ vdev_raidz_matrix_invert(rr, n, nmissing_rows, missing_rows, rows,
invrows, used);
/*
* Reconstruct the missing data using the generated matrix.
*/
- vdev_raidz_matrix_reconstruct(rm, n, nmissing_rows, missing_rows,
+ vdev_raidz_matrix_reconstruct(rr, n, nmissing_rows, missing_rows,
invrows, used);
kmem_free(p, psize);
@@ -1572,21 +1779,23 @@
* copy back from temporary linear abds and free them
*/
if (bufs) {
- for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) {
- raidz_col_t *col = &rm->rm_col[c];
+ for (c = rr->rr_firstdatacol; c < rr->rr_cols; c++) {
+ raidz_col_t *col = &rr->rr_col[c];
- abd_copy(bufs[c], col->rc_abd, col->rc_size);
- abd_free(col->rc_abd);
+ if (bufs[c] != NULL) {
+ abd_copy(bufs[c], col->rc_abd, col->rc_size);
+ abd_free(col->rc_abd);
+ }
col->rc_abd = bufs[c];
}
- kmem_free(bufs, rm->rm_cols * sizeof (abd_t *));
+ kmem_free(bufs, rr->rr_cols * sizeof (abd_t *));
}
return (code);
}
static int
-vdev_raidz_reconstruct(raidz_map_t *rm, int *t, int nt)
+vdev_raidz_reconstruct_row(raidz_row_t *rr, int *t, int nt)
{
int tgts[VDEV_RAIDZ_MAXPARITY], *dt;
int ntgts;
@@ -1595,26 +1804,37 @@
int nbadparity, nbaddata;
int parity_valid[VDEV_RAIDZ_MAXPARITY];
+ zfs_dbgmsg("reconstruct(rm=%p nt=%u cols=%u md=%u mp=%u)",
+ rr, nt, (int)rr->rr_cols, (int)rr->rr_missingdata, (int)rr->rr_missingparity);
+
/*
* The tgts list must already be sorted.
*/
+ zfs_dbgmsg("reconstruct(rm=%p t[%u]=%u)", rr, 0, t[0]);
for (i = 1; i < nt; i++) {
+ zfs_dbgmsg("reconstruct(rm=%p t[%u]=%u)",
+ rr, i, t[i]);
ASSERT(t[i] > t[i - 1]);
}
- nbadparity = rm->rm_firstdatacol;
- nbaddata = rm->rm_cols - nbadparity;
+ nbadparity = rr->rr_firstdatacol;
+ nbaddata = rr->rr_cols - nbadparity;
ntgts = 0;
- for (i = 0, c = 0; c < rm->rm_cols; c++) {
- if (c < rm->rm_firstdatacol)
+ for (i = 0, c = 0; c < rr->rr_cols; c++) {
+ zfs_dbgmsg("reconstruct(rm=%p col=%u devid=%u offset=%llx error=%u)",
+ rr, c,
+ (int)rr->rr_col[c].rc_devidx,
+ (long long)rr->rr_col[c].rc_offset,
+ (int)rr->rr_col[c].rc_error);
+ if (c < rr->rr_firstdatacol)
parity_valid[c] = B_FALSE;
if (i < nt && c == t[i]) {
tgts[ntgts++] = c;
i++;
- } else if (rm->rm_col[c].rc_error != 0) {
+ } else if (rr->rr_col[c].rc_error != 0) {
tgts[ntgts++] = c;
- } else if (c >= rm->rm_firstdatacol) {
+ } else if (c >= rr->rr_firstdatacol) {
nbaddata--;
} else {
parity_valid[c] = B_TRUE;
@@ -1635,30 +1855,30 @@
switch (nbaddata) {
case 1:
if (parity_valid[VDEV_RAIDZ_P])
- return (vdev_raidz_reconstruct_p(rm, dt, 1));
+ return (vdev_raidz_reconstruct_p(rr, dt, 1));
- ASSERT(rm->rm_firstdatacol > 1);
+ ASSERT(rr->rr_firstdatacol > 1);
if (parity_valid[VDEV_RAIDZ_Q])
- return (vdev_raidz_reconstruct_q(rm, dt, 1));
+ return (vdev_raidz_reconstruct_q(rr, dt, 1));
- ASSERT(rm->rm_firstdatacol > 2);
+ ASSERT(rr->rr_firstdatacol > 2);
break;
case 2:
- ASSERT(rm->rm_firstdatacol > 1);
+ ASSERT(rr->rr_firstdatacol > 1);
if (parity_valid[VDEV_RAIDZ_P] &&
parity_valid[VDEV_RAIDZ_Q])
- return (vdev_raidz_reconstruct_pq(rm, dt, 2));
+ return (vdev_raidz_reconstruct_pq(rr, dt, 2));
- ASSERT(rm->rm_firstdatacol > 2);
+ ASSERT(rr->rr_firstdatacol > 2);
break;
}
}
- code = vdev_raidz_reconstruct_general(rm, tgts, ntgts);
+ code = vdev_raidz_reconstruct_general(rr, tgts, ntgts);
ASSERT(code < (1 << VDEV_RAIDZ_MAXPARITY));
ASSERT(code > 0);
return (code);
@@ -1668,8 +1888,8 @@
vdev_raidz_open(vdev_t *vd, uint64_t *asize, uint64_t *max_asize,
uint64_t *logical_ashift, uint64_t *physical_ashift)
{
- vdev_t *cvd;
- uint64_t nparity = vd->vdev_nparity;
+ vdev_raidz_t *vdrz = vd->vdev_tsd;
+ uint64_t nparity = vdrz->vd_nparity;
int c;
int lasterror = 0;
int numerrors = 0;
@@ -1685,7 +1905,7 @@
vdev_open_children(vd);
for (c = 0; c < vd->vdev_children; c++) {
- cvd = vd->vdev_child[c];
+ vdev_t *cvd = vd->vdev_child[c];
if (cvd->vdev_open_error != 0) {
lasterror = cvd->vdev_open_error;
@@ -1786,9 +2006,10 @@
vdev_raidz_physio(vdev_t *vd, caddr_t data, size_t size,
uint64_t offset, uint64_t origoffset, boolean_t doread, boolean_t isdump)
{
+ vdev_raidz_t *vdrz = vd->vdev_tsd;
vdev_t *tvd = vd->vdev_top;
vdev_t *cvd;
- raidz_map_t *rm;
+ raidz_row_t *rr;
raidz_col_t *rc;
int c, err = 0;
@@ -1818,15 +2039,19 @@
*/
abd_t *abd = abd_get_from_buf(data - (offset - origoffset),
SPA_OLD_MAXBLOCKSIZE);
- rm = vdev_raidz_map_alloc(abd,
+ /*
+ * XXX deal with dump to expanded raidz
+ */
+ raidz_map_t *rm = vdev_raidz_map_alloc(abd,
SPA_OLD_MAXBLOCKSIZE, origoffset, B_FALSE, tvd->vdev_ashift,
- vd->vdev_children, vd->vdev_nparity);
+ vd->vdev_children, vdrz->vd_nparity);
+ rr = rm->rm_row[0];
coloffset = origoffset;
- for (c = rm->rm_firstdatacol; c < rm->rm_cols;
+ for (c = rr->rr_firstdatacol; c < rr->rr_cols;
c++, coloffset += rc->rc_size) {
- rc = &rm->rm_col[c];
+ rc = &rr->rr_col[c];
cvd = vd->vdev_child[rc->rc_devidx];
/*
@@ -1863,7 +2088,7 @@
break;
}
- vdev_raidz_map_free(rm);
+ vdev_raidz_row_free(rr);
abd_put(abd);
#endif /* KERNEL */
@@ -1874,10 +2099,11 @@
static uint64_t
vdev_raidz_asize(vdev_t *vd, uint64_t psize)
{
+ vdev_raidz_t *vdrz = vd->vdev_tsd;
uint64_t asize;
uint64_t ashift = vd->vdev_top->vdev_ashift;
- uint64_t cols = vd->vdev_children;
- uint64_t nparity = vd->vdev_nparity;
+ uint64_t cols = vdrz->vd_logical_width;
+ uint64_t nparity = vdrz->vd_nparity;
asize = ((psize - 1) >> ashift) + 1;
asize += nparity * ((asize + cols - nparity - 1) / (cols - nparity));
@@ -1896,119 +2122,137 @@
rc->rc_skipped = 0;
}
-/*
- * Start an IO operation on a RAIDZ VDev
- *
- * Outline:
- * - For write operations:
- * 1. Generate the parity data
- * 2. Create child zio write operations to each column's vdev, for both
- * data and parity.
- * 3. If the column skips any sectors for padding, create optional dummy
- * write zio children for those areas to improve aggregation continuity.
- * - For read operations:
- * 1. Create child zio read operations to each data column's vdev to read
- * the range of data required for zio.
- * 2. If this is a scrub or resilver operation, or if any of the data
- * vdevs have had errors, then create zio read operations to the parity
- * columns' VDevs as well.
- */
static void
-vdev_raidz_io_start(zio_t *zio)
+vdev_raidz_io_verify(zio_t *zio, raidz_row_t *rr, int col)
{
+#if 0
+#ifdef ZFS_DEBUG
vdev_t *vd = zio->io_vd;
vdev_t *tvd = vd->vdev_top;
- vdev_t *cvd;
- raidz_map_t *rm;
- raidz_col_t *rc;
- int c, i;
- rm = vdev_raidz_map_alloc(zio->io_abd, zio->io_size, zio->io_offset,
- zio->io_type == ZIO_TYPE_FREE,
- tvd->vdev_ashift, vd->vdev_children,
- vd->vdev_nparity);
+ range_seg_t logical_rs, physical_rs;
+ logical_rs.rs_start = zio->io_offset;
+ logical_rs.rs_end = logical_rs.rs_start +
+ vdev_raidz_asize(zio->io_vd, zio->io_size);
- zio->io_vsd = rm;
- zio->io_vsd_ops = &vdev_raidz_vsd_ops;
+ raidz_col_t *rc = &rr->rr_col[col];
+ vdev_t *cvd = vd->vdev_child[rc->rc_devidx];
- ASSERT3U(rm->rm_asize, ==, vdev_psize_to_asize(vd, zio->io_size));
+ vdev_xlate(cvd, &logical_rs, &physical_rs);
+ ASSERT3U(rc->rc_offset, ==, physical_rs.rs_start);
+ ASSERT3U(rc->rc_offset, <, physical_rs.rs_end);
+ /*
+ * It would be nice to assert that rs_end is equal
+ * to rc_offset + rc_size but there might be an
+ * optional I/O at the end that is not accounted in
+ * rc_size.
+ */
+ if (physical_rs.rs_end > rc->rc_offset + rc->rc_size) {
+ ASSERT3U(physical_rs.rs_end, ==, rc->rc_offset +
+ rc->rc_size + (1 << tvd->vdev_ashift));
+ } else {
+ ASSERT3U(physical_rs.rs_end, ==, rc->rc_offset + rc->rc_size);
+ }
+#endif
+#endif
+}
- if (zio->io_type == ZIO_TYPE_FREE) {
- for (c = 0; c < rm->rm_cols; c++) {
- rc = &rm->rm_col[c];
- cvd = vd->vdev_child[rc->rc_devidx];
- zio_nowait(zio_vdev_child_io(zio, NULL, cvd,
- rc->rc_offset, rc->rc_abd, rc->rc_size,
- zio->io_type, zio->io_priority, 0,
- vdev_raidz_child_done, rc));
- }
+static void
+vdev_raidz_io_start_free(zio_t *zio, raidz_row_t *rr)
+{
+ vdev_t *vd = zio->io_vd;
+ vdev_t *tvd = vd->vdev_top;
- zio_execute(zio);
- return;
+ for (int c = 0; c < rr->rr_cols; c++) {
+ raidz_col_t *rc = &rr->rr_col[c];
+ if (rc->rc_size == 0)
+ continue;
+ vdev_t *cvd = vd->vdev_child[rc->rc_devidx];
+
+ zio_nowait(zio_vdev_child_io(zio, NULL, cvd,
+ rc->rc_offset, rc->rc_abd, rc->rc_size,
+ zio->io_type, zio->io_priority, 0,
+ vdev_raidz_child_done, rc));
}
+}
- if (zio->io_type == ZIO_TYPE_WRITE) {
- vdev_raidz_generate_parity(rm);
+static void
+vdev_raidz_io_start_write(zio_t *zio, raidz_row_t *rr)
+{
+ vdev_t *vd = zio->io_vd;
+ vdev_t *tvd = vd->vdev_top;
- for (c = 0; c < rm->rm_cols; c++) {
- rc = &rm->rm_col[c];
- cvd = vd->vdev_child[rc->rc_devidx];
- zio_nowait(zio_vdev_child_io(zio, NULL, cvd,
- rc->rc_offset, rc->rc_abd, rc->rc_size,
- zio->io_type, zio->io_priority, 0,
- vdev_raidz_child_done, rc));
- }
+ vdev_raidz_generate_parity(rr);
- /*
- * Generate optional I/Os for any skipped sectors to improve
- * aggregation contiguity.
- */
- for (c = rm->rm_skipstart, i = 0; i < rm->rm_nskip; c++, i++) {
- ASSERT(c <= rm->rm_scols);
- if (c == rm->rm_scols)
- c = 0;
- rc = &rm->rm_col[c];
- cvd = vd->vdev_child[rc->rc_devidx];
- zio_nowait(zio_vdev_child_io(zio, NULL, cvd,
- rc->rc_offset + rc->rc_size, NULL,
- 1 << tvd->vdev_ashift,
- zio->io_type, zio->io_priority,
- ZIO_FLAG_NODATA | ZIO_FLAG_OPTIONAL, NULL, NULL));
- }
+ for (int c = 0; c < rr->rr_cols; c++) {
+ raidz_col_t *rc = &rr->rr_col[c];
+ if (rc->rc_size == 0)
+ continue;
+ vdev_t *cvd = vd->vdev_child[rc->rc_devidx];
- zio_execute(zio);
- return;
+ zio_nowait(zio_vdev_child_io(zio, NULL, cvd,
+ rc->rc_offset, rc->rc_abd, rc->rc_size,
+ zio->io_type, zio->io_priority, 0,
+ vdev_raidz_child_done, rc));
+ }
+
+ /* XXX The optional skip-sector I/Os below are disabled; do this in
+ * vdev_raidz_io_start instead, based on nskip stored in rm. */
+#if 0
+ /*
+ * Generate optional I/Os for any skipped sectors to improve
+ * aggregation contiguity.
+ */
+ for (int c = rr->rm_skipstart, i = 0; i < rr->rm_nskip; c++, i++) {
+ ASSERT(c <= rr->rm_scols);
+ if (c == rr->rm_scols)
+ c = 0;
+ raidz_col_t *rc = &rr->rr_col[c];
+ vdev_t *cvd = vd->vdev_child[rc->rc_devidx];
+ zio_nowait(zio_vdev_child_io(zio, NULL, cvd,
+ rc->rc_offset + rc->rc_size, NULL,
+ 1 << tvd->vdev_ashift,
+ zio->io_type, zio->io_priority,
+ ZIO_FLAG_NODATA | ZIO_FLAG_OPTIONAL, NULL, NULL));
}
+#endif
+}
- ASSERT(zio->io_type == ZIO_TYPE_READ);
+static void
+vdev_raidz_io_start_read(zio_t *zio, raidz_row_t *rr, boolean_t forceparity)
+{
+ vdev_t *vd = zio->io_vd;
/*
* Iterate over the columns in reverse order so that we hit the parity
* last -- any errors along the way will force us to read the parity.
*/
- for (c = rm->rm_cols - 1; c >= 0; c--) {
- rc = &rm->rm_col[c];
- cvd = vd->vdev_child[rc->rc_devidx];
+ for (int c = rr->rr_cols - 1; c >= 0; c--) {
+ raidz_col_t *rc = &rr->rr_col[c];
+ if (rc->rc_size == 0)
+ continue;
+ vdev_t *cvd = vd->vdev_child[rc->rc_devidx];
if (!vdev_readable(cvd)) {
- if (c >= rm->rm_firstdatacol)
- rm->rm_missingdata++;
+ if (c >= rr->rr_firstdatacol)
+ rr->rr_missingdata++;
else
- rm->rm_missingparity++;
+ rr->rr_missingparity++;
rc->rc_error = SET_ERROR(ENXIO);
rc->rc_tried = 1; /* don't even try */
rc->rc_skipped = 1;
continue;
}
if (vdev_dtl_contains(cvd, DTL_MISSING, zio->io_txg, 1)) {
- if (c >= rm->rm_firstdatacol)
- rm->rm_missingdata++;
+ if (c >= rr->rr_firstdatacol)
+ rr->rr_missingdata++;
else
- rm->rm_missingparity++;
+ rr->rr_missingparity++;
rc->rc_error = SET_ERROR(ESTALE);
rc->rc_skipped = 1;
continue;
}
- if (c >= rm->rm_firstdatacol || rm->rm_missingdata > 0 ||
+ if (forceparity ||
+ c >= rr->rr_firstdatacol || rr->rr_missingdata > 0 ||
(zio->io_flags & (ZIO_FLAG_SCRUB | ZIO_FLAG_RESILVER))) {
zio_nowait(zio_vdev_child_io(zio, NULL, cvd,
rc->rc_offset, rc->rc_abd, rc->rc_size,
@@ -2016,6 +2260,75 @@
vdev_raidz_child_done, rc));
}
}
+}
+
+/*
+ * Start an IO operation on a RAIDZ VDev
+ *
+ * Outline:
+ * - For write operations:
+ * 1. Generate the parity data
+ * 2. Create child zio write operations to each column's vdev, for both
+ * data and parity.
+ * 3. If the column skips any sectors for padding, create optional dummy
+ * write zio children for those areas to improve aggregation contiguity.
+ * - For read operations:
+ * 1. Create child zio read operations to each data column's vdev to read
+ * the range of data required for zio.
+ * 2. If this is a scrub or resilver operation, or if any of the data
+ * vdevs have had errors, then create zio read operations to the parity
+ * columns' VDevs as well.
+ */
+static void
+vdev_raidz_io_start(zio_t *zio)
+{
+ vdev_t *vd = zio->io_vd;
+ vdev_t *tvd = vd->vdev_top;
+ vdev_raidz_t *vdrz = vd->vdev_tsd;
+ raidz_map_t *rm;
+
+ ASSERT(!vdrz->vn_expanding);
+
+ if (vdrz->vd_logical_width != vdrz->vd_physical_width) {
+ rm = vdev_raidz_map_alloc_expanded(zio->io_abd,
+ zio->io_size, zio->io_offset,
+ zio->io_type == ZIO_TYPE_FREE,
+ tvd->vdev_ashift, vdrz->vd_physical_width,
+ vdrz->vd_logical_width, vdrz->vd_nparity);
+ } else {
+ rm = vdev_raidz_map_alloc(zio->io_abd,
+ zio->io_size, zio->io_offset,
+ zio->io_type == ZIO_TYPE_FREE,
+ tvd->vdev_ashift, vdrz->vd_logical_width,
+ vdrz->vd_nparity);
+ }
+
+ zio->io_vsd = rm;
+ zio->io_vsd_ops = &vdev_raidz_vsd_ops;
+
+ if (zio->io_type == ZIO_TYPE_FREE) {
+ for (int i = 0; i < rm->rm_nrows; i++) {
+ vdev_raidz_io_start_free(zio, rm->rm_row[i]);
+ }
+ } else if (zio->io_type == ZIO_TYPE_WRITE) {
+ for (int i = 0; i < rm->rm_nrows; i++) {
+ vdev_raidz_io_start_write(zio,
+ rm->rm_row[i]);
+ }
+ } else {
+ ASSERT(zio->io_type == ZIO_TYPE_READ);
+ /*
+ * If there are multiple rows, we will be hitting
+ * all disks, so go ahead and read the parity so
+ * that we are reading in decent size chunks.
+ * XXX maybe doesn't really matter?
+ */
+ boolean_t forceparity = rm->rm_nrows > 1;
+ for (int i = 0; i < rm->rm_nrows; i++) {
+ vdev_raidz_io_start_read(zio,
+ rm->rm_row[i], forceparity);
+ }
+ }
zio_execute(zio);
}
@@ -2070,10 +2383,10 @@
* Generate the parity from the data columns. If we tried and were able to
* read the parity without error, verify that the generated parity matches the
* data we read. If it doesn't, we fire off a checksum error. Return the
- * number such failures.
+ * number of such failures.
*/
static int
-raidz_parity_verify(zio_t *zio, raidz_map_t *rm)
+raidz_parity_verify(zio_t *zio, raidz_row_t *rr)
{
void *orig[VDEV_RAIDZ_MAXPARITY];
int c, ret = 0;
@@ -2086,21 +2399,29 @@
if (checksum == ZIO_CHECKSUM_NOPARITY)
return (ret);
- for (c = 0; c < rm->rm_firstdatacol; c++) {
- rc = &rm->rm_col[c];
+ for (c = 0; c < rr->rr_firstdatacol; c++) {
+ rc = &rr->rr_col[c];
if (!rc->rc_tried || rc->rc_error != 0)
continue;
orig[c] = zio_buf_alloc(rc->rc_size);
abd_copy_to_buf(orig[c], rc->rc_abd, rc->rc_size);
}
- vdev_raidz_generate_parity(rm);
+ /* XXX This regenerates every parity column, including columns that were
+ * not tried or that had an error (!rc_tried || rc_error != 0). As a side
+ * effect it could fix parity we didn't know was bad (even if we return 0).
+ */
+ vdev_raidz_generate_parity(rr);
+
+ for (c = 0; c < rr->rr_firstdatacol; c++) {
+ rc = &rr->rr_col[c];
- for (c = 0; c < rm->rm_firstdatacol; c++) {
- rc = &rm->rm_col[c];
if (!rc->rc_tried || rc->rc_error != 0)
continue;
+
if (abd_cmp_buf(rc->rc_abd, orig[c], rc->rc_size) != 0) {
+ zfs_dbgmsg("raidz_parity_verify found error on col=%u devidx=%u",
+ c, (int)rc->rc_devidx);
raidz_checksum_error(zio, rc, orig[c]);
rc->rc_error = SET_ERROR(ECKSUM);
ret++;
@@ -2117,16 +2438,83 @@
static uint64_t raidz_corrected[1 << VDEV_RAIDZ_MAXPARITY];
static int
-vdev_raidz_worst_error(raidz_map_t *rm)
+vdev_raidz_worst_error(raidz_row_t *rr)
{
int error = 0;
- for (int c = 0; c < rm->rm_cols; c++)
- error = zio_worst_error(error, rm->rm_col[c].rc_error);
+ for (int c = 0; c < rr->rr_cols; c++)
+ error = zio_worst_error(error, rr->rr_col[c].rc_error);
return (error);
}
+/*
+ * Finish a read of one row. The callers in this file invoke this after
+ * raidz_checksum_verify() has succeeded. It tallies the row's column
+ * errors, verifies parity where more was read than reconstruction needed
+ * (or when resilvering), and issues repair writes to columns in error.
+ */
+static void
+vdev_raidz_io_done_verified(zio_t *zio, raidz_row_t *rr)
+{
+	int nparity_bad = 0;
+	int nparity_unread = 0;
+	int ndata_bad = 0;
+	int nunexpected = 0;
+
+	ASSERT3U(zio->io_type, ==, ZIO_TYPE_READ);
+
+	/* Classify every column of the row. */
+	for (int col = 0; col < rr->rr_cols; col++) {
+		raidz_col_t *rc = &rr->rr_col[col];
+		boolean_t is_parity = (col < rr->rr_firstdatacol);
+
+		if (rc->rc_error == 0) {
+			if (is_parity && !rc->rc_tried)
+				nparity_unread++;
+			continue;
+		}
+
+		if (is_parity)
+			nparity_bad++;
+		else
+			ndata_bad++;
+
+		/* Errors on columns that were not skipped are unexpected. */
+		if (!rc->rc_skipped)
+			nunexpected++;
+	}
+
+	/*
+	 * If we read more parity disks than were used for
+	 * reconstruction, confirm that the other parity disks produced
+	 * correct data.
+	 *
+	 * Note that we also regenerate parity when resilvering so we
+	 * can write it out to failed devices later.
+	 */
+	boolean_t spare_parity = (nparity_bad + nparity_unread <
+	    rr->rr_firstdatacol - ndata_bad);
+	zfs_dbgmsg("parity_errors=%u parity_untried=%u data_errors=%u verifying=%s",
+	    nparity_bad, nparity_unread, ndata_bad,
+	    spare_parity ? "yes" : "no");
+	if (spare_parity || (zio->io_flags & ZIO_FLAG_RESILVER)) {
+		int n = raidz_parity_verify(zio, rr);
+		nunexpected += n;
+		ASSERT3U(nparity_bad + n, <=, rr->rr_firstdatacol);
+	}
+
+	/* Repairs need an error-free zio and a writeable pool ... */
+	if (zio->io_error != 0 || !spa_writeable(zio->io_spa))
+		return;
+	/* ... and either an unexpected error or a resilver in progress. */
+	if (nunexpected <= 0 && !(zio->io_flags & ZIO_FLAG_RESILVER))
+		return;
+
+	/*
+	 * Use the good data we have in hand to repair damaged children.
+	 */
+	vdev_t *vd = zio->io_vd;
+	for (int col = 0; col < rr->rr_cols; col++) {
+		raidz_col_t *rc = &rr->rr_col[col];
+
+		if (rc->rc_error != 0 && rc->rc_size != 0) {
+			zio_nowait(zio_vdev_child_io(zio, NULL,
+			    vd->vdev_child[rc->rc_devidx],
+			    rc->rc_offset, rc->rc_abd, rc->rc_size,
+			    ZIO_TYPE_WRITE, ZIO_PRIORITY_ASYNC_WRITE,
+			    ZIO_FLAG_IO_REPAIR | (nunexpected ?
+			    ZIO_FLAG_SELF_HEAL : 0), NULL, NULL));
+		}
+	}
+}
+
/*
* Iterate over all combinations of bad data and attempt a reconstruction.
* Note that the algorithm below is non-optimal because it doesn't take into
@@ -2134,454 +2522,771 @@
* triple-parity RAID-Z the reconstruction procedure is the same if column 4
* is targeted as invalid as if columns 1 and 4 are targeted since in both
* cases we'd only use parity information in column 0.
+ *
+ * The order that we find the various possible combinations of failed
+ * disks is dictated by these rules:
+ * - Examine each "slot" (the "i" in tgts[i])
+ * - Try to increment this slot (tgts[i] = tgts[i] + 1)
+ * - if we can't increment because it runs into the next slot,
+ * reset our slot to the minimum, and examine the next slot
+ * For example, with a 6-wide RAIDZ3, and no known errors (so we have to choose
+ * 3 columns to reconstruct), we will generate the following sequence:
+ *
+ * STATE ACTION
+ * 0 1 2 special case: skip since these are all parity
+ * 0 1 3 first slot: reset to 0; middle slot: increment to 2
+ * 0 2 3 first slot: increment to 1
+ * 1 2 3 first: reset to 0; middle: reset to 1; last: increment to 4
+ * 0 1 4 first: reset to 0; middle: increment to 2
+ * 0 2 4 first: increment to 1
+ * 1 2 4 first: reset to 0; middle: increment to 3
+ * 0 3 4 first: increment to 1
+ * 1 3 4 first: increment to 2
+ * 2 3 4 first: reset to 0; middle: reset to 1; last: increment to 5
+ * 0 1 5 first: reset to 0; middle: increment to 2
+ * 0 2 5 first: increment to 1
+ * 1 2 5 first: reset to 0; middle: increment to 3
+ * 0 3 5 first: increment to 1
+ * 1 3 5 first: increment to 2
+ * 2 3 5 first: reset to 0; middle: increment to 4
+ * 0 4 5 first: increment to 1
+ * 1 4 5 first: increment to 2
+ * 2 4 5 first: increment to 3
+ * 3 4 5 done
*/
-static int
-vdev_raidz_combrec(zio_t *zio, int total_errors, int data_errors)
-{
- raidz_map_t *rm = zio->io_vsd;
- raidz_col_t *rc;
- void *orig[VDEV_RAIDZ_MAXPARITY];
- int tstore[VDEV_RAIDZ_MAXPARITY + 2];
- int *tgts = &tstore[1];
- int current, next, i, c, n;
- int code, ret = 0;
- ASSERT(total_errors < rm->rm_firstdatacol);
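+/*
+ * Should this sector be considered failed for logical child ID i?
+ *
+ * Logical child IDs enumerate (width, child) pairs: the first
+ * vd_physical_width IDs are the children at the current width, the next
+ * vd_physical_width - 1 IDs are the children at the next narrower width,
+ * and so on down to vd_logical_width. A column matches logical child
+ * (w, i) when its sector number, computed as
+ * vd_physical_width * (rc_offset >> ashift) + rc_devidx, modulo w equals i.
+ */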
+static boolean_t
+raidz_simulate_failure(vdev_raidz_t *vdrz, int ashift, int i, raidz_col_t *rc)
+{
+ uint64_t sector_id =
+ vdrz->vd_physical_width * (rc->rc_offset >> ashift) +
+ rc->rc_devidx;
+
+#if 0
+ zfs_dbgmsg("raidz_simulate_failure(pw=%u lw=%u ashift=%u i=%u rc_offset=%llx rc_devidx=%u sector_id=%u",
+ vdrz->vd_physical_width,
+ vdrz->vd_logical_width,
+ ashift,
+ i,
+ (long long)rc->rc_offset,
+ (int)rc->rc_devidx,
+ (long long)sector_id);
+#endif
- /*
- * This simplifies one edge condition.
- */
- tgts[-1] = -1;
+ for (int w = vdrz->vd_physical_width;
+ w >= vdrz->vd_logical_width; w--) {
+ if (i < w) {
+ return (sector_id % w == i);
+ } else {
+ i -= w;
+ }
+ }
+ ASSERT(!"invalid logical child id");
+ return (B_FALSE);
+}
- for (n = 1; n <= rm->rm_firstdatacol - total_errors; n++) {
- /*
- * Initialize the targets array by finding the first n columns
- * that contain no error.
- *
- * If there were no data errors, we need to ensure that we're
- * always explicitly attempting to reconstruct at least one
- * data column. To do this, we simply push the highest target
- * up into the data columns.
- */
- for (c = 0, i = 0; i < n; i++) {
- if (i == n - 1 && data_errors == 0 &&
- c < rm->rm_firstdatacol) {
- c = rm->rm_firstdatacol;
+static void
+raidz_restore_orig_data(raidz_map_t *rm)
+{
+ for (int i = 0; i < rm->rm_nrows; i++) {
+ raidz_row_t *rr = rm->rm_row[i];
+ for (int c = 0; c < rr->rr_cols; c++) {
+ raidz_col_t *rc = &rr->rr_col[c];
+ if (rc->rc_need_orig_restore) {
+ abd_copy_from_buf(rc->rc_abd,
+ rc->rc_orig_data, rc->rc_size);
+ rc->rc_need_orig_restore = B_FALSE;
}
+ }
+ }
+}
- while (rm->rm_col[c].rc_error != 0) {
- c++;
- ASSERT3S(c, <, rm->rm_cols);
+/*
+ * returns EINVAL if reconstruction of the block will not be possible
+ * returns ECKSUM if this specific reconstruction failed
+ * returns 0 on successful reconstruction
+ */
+static int
+raidz_reconstruct(zio_t *zio, int *ltgts, int ntgts)
+{
+ raidz_map_t *rm = zio->io_vsd;
+ vdev_raidz_t *vdrz = zio->io_vd->vdev_tsd;
+
+ zfs_dbgmsg("raidz_reconstruct_expanded(zio=%p ltgts=%u,%u,%u ntgts=%u",
+ zio, ltgts[0], ltgts[1], ltgts[2], ntgts);
+
+ /* Reconstruct each row */
+ for (int r = 0; r < rm->rm_nrows; r++) {
+ raidz_row_t *rr = rm->rm_row[r];
+ int my_tgts[VDEV_RAIDZ_MAXPARITY]; /* value is child id */
+ int t = 0;
+ int dead = 0;
+ int dead_data = 0;
+
+ zfs_dbgmsg("raidz_reconstruct_expanded(row=%u)",
+ r);
+
+ for (int c = 0; c < rr->rr_cols; c++) {
+ raidz_col_t *rc = &rr->rr_col[c];
+ ASSERT0(rc->rc_need_orig_restore);
+ if (rc->rc_error != 0) {
+ dead++;
+ if (c >= vdrz->vd_nparity)
+ dead_data++;
+ continue;
+ }
+ if (rc->rc_size == 0)
+ continue;
+ for (int lt = 0; lt < ntgts; lt++) {
+ if (raidz_simulate_failure(vdrz,
+ zio->io_vd->vdev_top->vdev_ashift,
+ ltgts[lt], rc)) {
+ if (rc->rc_orig_data == NULL) {
+ rc->rc_orig_data =
+ zio_buf_alloc(rc->rc_size);
+ abd_copy_to_buf(rc->rc_orig_data,
+ rc->rc_abd, rc->rc_size);
+ }
+ rc->rc_need_orig_restore = B_TRUE;
+
+ dead++;
+ if (c >= vdrz->vd_nparity)
+ dead_data++;
+ my_tgts[t++] = c;
+ zfs_dbgmsg("simulating failure of col %u devidx %u",
+ c, (int)rc->rc_devidx);
+ break;
+ }
}
-
- tgts[i] = c++;
}
-
- /*
- * Setting tgts[n] simplifies the other edge condition.
- */
- tgts[n] = rm->rm_cols;
-
- /*
- * These buffers were allocated in previous iterations.
- */
- for (i = 0; i < n - 1; i++) {
- ASSERT(orig[i] != NULL);
+ if (dead > vdrz->vd_nparity) {
+ /* reconstruction not possible */
+ zfs_dbgmsg("reconstruction not possible; too many failures");
+ raidz_restore_orig_data(rm);
+ return (EINVAL);
}
+ rr->rr_code = 0;
+ if (dead_data > 0)
+ rr->rr_code = vdev_raidz_reconstruct_row(rr, my_tgts, t);
+ }
- orig[n - 1] = zio_buf_alloc(rm->rm_col[0].rc_size);
+ /* Check for success */
+ if (raidz_checksum_verify(zio) == 0) {
+
+ /* Reconstruction succeeded - report errors */
+ for (int i = 0; i < rm->rm_nrows; i++) {
+ raidz_row_t *rr = rm->rm_row[i];
+ atomic_inc_64(&raidz_corrected[rr->rr_code]);
+
+ for (int c = 0; c < rr->rr_cols; c++) {
+ raidz_col_t *rc = &rr->rr_col[c];
+ if (rc->rc_need_orig_restore) {
+ /*
+ * Note: if this is a parity column,
+ * we don't really know if it's wrong.
+ * We need to let
+ * vdev_raidz_io_done_verified() check
+ * it, and if we set rc_error, it will
+ * think that it is a "known" error
+ * that doesn't need to be checked
+ * or corrected.
+ */
+ if (rc->rc_error == 0 && c >= rr->rr_firstdatacol) {
+ raidz_checksum_error(zio, rc, rc->rc_gdata);
+ rc->rc_error = SET_ERROR(ECKSUM);
+ }
+ rc->rc_need_orig_restore = B_FALSE;
+ }
+ }
- current = 0;
- next = tgts[current];
+ vdev_raidz_io_done_verified(zio, rr);
+ }
- while (current != n) {
- tgts[current] = next;
- current = 0;
+ zio_checksum_verified(zio);
- /*
- * Save off the original data that we're going to
- * attempt to reconstruct.
- */
- for (i = 0; i < n; i++) {
- ASSERT(orig[i] != NULL);
- c = tgts[i];
- ASSERT3S(c, >=, 0);
- ASSERT3S(c, <, rm->rm_cols);
- rc = &rm->rm_col[c];
- abd_copy_to_buf(orig[i], rc->rc_abd,
- rc->rc_size);
- }
+ zfs_dbgmsg("reconstruction successful (checksum verified)");
+ return (0);
+ }
- /*
- * Attempt a reconstruction and exit the outer loop on
- * success.
- */
- code = vdev_raidz_reconstruct(rm, tgts, n);
- if (raidz_checksum_verify(zio) == 0) {
- atomic_inc_64(&raidz_corrected[code]);
-
- for (i = 0; i < n; i++) {
- c = tgts[i];
- rc = &rm->rm_col[c];
- ASSERT(rc->rc_error == 0);
- if (rc->rc_tried)
- raidz_checksum_error(zio, rc,
- orig[i]);
- rc->rc_error = SET_ERROR(ECKSUM);
- }
+ /* Reconstruction failed - restore original data */
+ raidz_restore_orig_data(rm);
+ zfs_dbgmsg("raidz_reconstruct_expanded(zio=%p) checksum failed",
+ zio);
+ return (ECKSUM);
+}
- ret = code;
- goto done;
- }
+/*
+ * return 0 on success, ECKSUM on failure
+ */
+static int
+vdev_raidz_combrec(zio_t *zio)
+{
+ raidz_map_t *rm = zio->io_vsd;
+ vdev_raidz_t *vdrz = zio->io_vd->vdev_tsd;
+
+ for (int num_failures = 1; num_failures <= vdrz->vd_nparity;
+ num_failures++) {
+ int tstore[VDEV_RAIDZ_MAXPARITY + 2];
+ int *ltgts = &tstore[1]; /* value is logical child ID */
+
+ /* Determine number of logical children, n */
+ int n = 0;
+ for (int w = vdrz->vd_physical_width;
+ w >= vdrz->vd_logical_width; w--) {
+ n += w;
+ }
- /*
- * Restore the original data.
- */
- for (i = 0; i < n; i++) {
- c = tgts[i];
- rc = &rm->rm_col[c];
- abd_copy_from_buf(rc->rc_abd, orig[i],
- rc->rc_size);
- }
+ ASSERT3U(num_failures, <=, vdrz->vd_nparity);
+ ASSERT3U(num_failures, <=, VDEV_RAIDZ_MAXPARITY);
+ /* handle corner cases in combrec logic */
+ ltgts[-1] = -1;
+ for (int i = 0; i < num_failures; i++) {
+ ltgts[i] = i;
+ }
+ ltgts[num_failures] = n;
- do {
+ for (;;) {
+ int err = raidz_reconstruct(zio,
+ ltgts, num_failures);
+ if (err == EINVAL) {
/*
- * Find the next valid column after the current
- * position..
+ * Reconstruction not possible with this #
+ * failures; try more failures.
*/
- for (next = tgts[current] + 1;
- next < rm->rm_cols &&
- rm->rm_col[next].rc_error != 0; next++)
- continue;
+ break;
+ } else if (err == 0)
+ return (0);
+
+ /* Compute next targets to try */
+ for (int t = 0; ; t++) {
+ ASSERT3U(t, <, num_failures);
+ ltgts[t]++;
+ if (ltgts[t] == n) {
+ ASSERT3U(t, ==, num_failures - 1);
+ zfs_dbgmsg("reconstruction failed for num_failures=%u; tried all combinations",
+ num_failures);
+ break; // try more failures
+ }
- ASSERT(next <= tgts[current + 1]);
+ ASSERT3U(ltgts[t], <, n);
+ ASSERT3U(ltgts[t], <=, ltgts[t + 1]);
/*
* If that spot is available, we're done here.
*/
- if (next != tgts[current + 1])
- break;
+ if (ltgts[t] != ltgts[t + 1])
+ break; // found next combination
/*
- * Otherwise, find the next valid column after
- * the previous position.
+ * Otherwise, reset this tgt to the minimum,
+ * and move on to the next tgt.
*/
- for (c = tgts[current - 1] + 1;
- rm->rm_col[c].rc_error != 0; c++)
- continue;
-
- tgts[current] = c;
- current++;
-
- } while (current != n);
+ ltgts[t] = ltgts[t - 1] + 1;
+ ASSERT3U(ltgts[t], ==, t);
+ }
+ if (ltgts[num_failures - 1] == n)
+ break; // try more failures
}
}
- n--;
-done:
- for (i = 0; i < n; i++) {
- zio_buf_free(orig[i], rm->rm_col[0].rc_size);
- }
-
- return (ret);
+ zfs_dbgmsg("reconstruction failed for all num_failures");
+ return (ECKSUM);
}
/*
- * Complete an IO operation on a RAIDZ VDev
+ * Complete a write IO operation on a RAIDZ VDev
*
* Outline:
- * - For write operations:
* 1. Check for errors on the child IOs.
* 2. Return, setting an error code if too few child VDevs were written
* to reconstruct the data later. Note that partial writes are
* considered successful if they can be reconstructed at all.
- * - For read operations:
- * 1. Check for errors on the child IOs.
- * 2. If data errors occurred:
- * a. Try to reassemble the data from the parity available.
- * b. If we haven't yet read the parity drives, read them now.
- * c. If all parity drives have been read but the data still doesn't
- * reassemble with a correct checksum, then try combinatorial
- * reconstruction.
- * d. If that doesn't work, return an error.
- * 3. If there were unexpected errors or this is a resilver operation,
- * rewrite the vdevs that had errors.
*/
static void
-vdev_raidz_io_done(zio_t *zio)
+vdev_raidz_io_done_write_impl(zio_t *zio, raidz_row_t *rr)
+{
+ int total_errors = 0;
+
+ ASSERT3U(rr->rr_missingparity, <=, rr->rr_firstdatacol);
+ ASSERT3U(rr->rr_missingdata, <=, rr->rr_cols - rr->rr_firstdatacol);
+ ASSERT3U(zio->io_type, ==, ZIO_TYPE_WRITE);
+
+ for (int c = 0; c < rr->rr_cols; c++) {
+ raidz_col_t *rc = &rr->rr_col[c];
+
+ if (rc->rc_error) {
+ ASSERT(rc->rc_error != ECKSUM); /* child has no bp */
+
+ total_errors++;
+ }
+ }
+
+ /*
+ * XXX -- for now, treat partial writes as a success.
+ * (If we couldn't write enough columns to reconstruct
+ * the data, the I/O failed. Otherwise, good enough.)
+ *
+ * Now that we support write reallocation, it would be better
+ * to treat partial failure as real failure unless there are
+ * no non-degraded top-level vdevs left, and not update DTLs
+ * if we intend to reallocate.
+ */
+ /* XXPOLICY */
+ if (total_errors > rr->rr_firstdatacol) {
+ zio->io_error = zio_worst_error(zio->io_error,
+ vdev_raidz_worst_error(rr));
+ }
+}
+
+/*
+ * return 0 if no reconstruction occurred, otherwise the "code" from
+ * vdev_raidz_reconstruct_row().
+ */
+static int
+vdev_raidz_io_done_reconstruct_known_missing(zio_t *zio, raidz_row_t *rr)
{
- vdev_t *vd = zio->io_vd;
- vdev_t *cvd;
- raidz_map_t *rm = zio->io_vsd;
- raidz_col_t *rc;
- int unexpected_errors = 0;
int parity_errors = 0;
int parity_untried = 0;
int data_errors = 0;
int total_errors = 0;
- int n, c;
- int tgts[VDEV_RAIDZ_MAXPARITY];
- int code;
-
- ASSERT(zio->io_bp != NULL); /* XXX need to add code to enforce this */
+ int code = 0;
- ASSERT(rm->rm_missingparity <= rm->rm_firstdatacol);
- ASSERT(rm->rm_missingdata <= rm->rm_cols - rm->rm_firstdatacol);
+ ASSERT3U(rr->rr_missingparity, <=, rr->rr_firstdatacol);
+ ASSERT3U(rr->rr_missingdata, <=, rr->rr_cols - rr->rr_firstdatacol);
+ ASSERT3U(zio->io_type, ==, ZIO_TYPE_READ);
- for (c = 0; c < rm->rm_cols; c++) {
- rc = &rm->rm_col[c];
+ for (int c = 0; c < rr->rr_cols; c++) {
+ raidz_col_t *rc = &rr->rr_col[c];
if (rc->rc_error) {
ASSERT(rc->rc_error != ECKSUM); /* child has no bp */
- if (c < rm->rm_firstdatacol)
+ if (c < rr->rr_firstdatacol)
parity_errors++;
else
data_errors++;
- if (!rc->rc_skipped)
- unexpected_errors++;
-
total_errors++;
- } else if (c < rm->rm_firstdatacol && !rc->rc_tried) {
+ } else if (c < rr->rr_firstdatacol && !rc->rc_tried) {
parity_untried++;
}
}
- if (zio->io_type == ZIO_TYPE_WRITE) {
+ /*
+ * If there were data errors and the number of errors we saw was
+ * correctable -- less than or equal to the number of parity disks read
+ * -- reconstruct based on the missing data.
+ */
+ if (data_errors != 0 &&
+ total_errors <= rr->rr_firstdatacol - parity_untried) {
/*
- * XXX -- for now, treat partial writes as a success.
- * (If we couldn't write enough columns to reconstruct
- * the data, the I/O failed. Otherwise, good enough.)
- *
- * Now that we support write reallocation, it would be better
- * to treat partial failure as real failure unless there are
- * no non-degraded top-level vdevs left, and not update DTLs
- * if we intend to reallocate.
+ * We either attempt to read all the parity columns or
+ * none of them. If we didn't try to read parity, we
+ * wouldn't be here in the correctable case. There must
+ * also have been fewer parity errors than parity
+ * columns or, again, we wouldn't be in this code path.
*/
- /* XXPOLICY */
- if (total_errors > rm->rm_firstdatacol)
- zio->io_error = vdev_raidz_worst_error(rm);
+ ASSERT(parity_untried == 0);
+ ASSERT(parity_errors < rr->rr_firstdatacol);
- return;
- } else if (zio->io_type == ZIO_TYPE_FREE) {
- return;
+ /*
+ * Identify the data columns that reported an error.
+ */
+ int n = 0;
+ int tgts[VDEV_RAIDZ_MAXPARITY];
+ for (int c = rr->rr_firstdatacol; c < rr->rr_cols; c++) {
+ raidz_col_t *rc = &rr->rr_col[c];
+ if (rc->rc_error != 0) {
+ ASSERT(n < VDEV_RAIDZ_MAXPARITY);
+ tgts[n++] = c;
+ }
+ }
+
+ ASSERT(rr->rr_firstdatacol >= n);
+
+ code = vdev_raidz_reconstruct_row(rr, tgts, n);
}
- ASSERT(zio->io_type == ZIO_TYPE_READ);
- /*
- * There are three potential phases for a read:
- * 1. produce valid data from the columns read
- * 2. read all disks and try again
- * 3. perform combinatorial reconstruction
- *
- * Each phase is progressively both more expensive and less likely to
- * occur. If we encounter more errors than we can repair or all phases
- * fail, we have no choice but to return an error.
- */
+ return (code);
+}
- /*
- * If the number of errors we saw was correctable -- less than or equal
- * to the number of parity disks read -- attempt to produce data that
- * has a valid checksum. Naturally, this case applies in the absence of
- * any errors.
- */
- if (total_errors <= rm->rm_firstdatacol - parity_untried) {
- if (data_errors == 0) {
- if (raidz_checksum_verify(zio) == 0) {
- /*
- * If we read parity information (unnecessarily
- * as it happens since no reconstruction was
- * needed) regenerate and verify the parity.
- * We also regenerate parity when resilvering
- * so we can write it out to the failed device
- * later.
- */
- if (parity_errors + parity_untried <
- rm->rm_firstdatacol ||
- (zio->io_flags & ZIO_FLAG_RESILVER)) {
- n = raidz_parity_verify(zio, rm);
- unexpected_errors += n;
- ASSERT(parity_errors + n <=
- rm->rm_firstdatacol);
- }
- goto done;
+/*
+ * Issue a child I/O (of the parent zio's type and priority) for every
+ * column of the row that has not been tried yet and has a nonzero size.
+ * The row's missing-data and missing-parity counts are reset first.
+ *
+ * Returns the number of child I/Os issued.
+ */
+static int
+vdev_raidz_read_all(zio_t *zio, raidz_row_t *rr)
+{
+	vdev_t *vd = zio->io_vd;
+	int issued = 0;
+
+	rr->rr_missingdata = 0;
+	rr->rr_missingparity = 0;
+
+	for (int col = 0; col < rr->rr_cols; col++) {
+		raidz_col_t *rc = &rr->rr_col[col];
+
+		if (!rc->rc_tried && rc->rc_size != 0) {
+			zio_nowait(zio_vdev_child_io(zio, NULL,
+			    vd->vdev_child[rc->rc_devidx],
+			    rc->rc_offset, rc->rc_abd, rc->rc_size,
+			    zio->io_type, zio->io_priority, 0,
+			    vdev_raidz_child_done, rc));
+			issued++;
+		}
+	}
+
+	return (issued);
+}
+
+static void
+vdev_raidz_io_done(zio_t *zio)
+{
+ raidz_map_t *rm = zio->io_vsd;
+ vdev_raidz_t *vdrz = zio->io_vd->vdev_tsd;
+
+ ASSERT(zio->io_bp != NULL); /* XXX need to add code to enforce this */
+ if (zio->io_type == ZIO_TYPE_WRITE) {
+ for (int i = 0; i < rm->rm_nrows; i++) {
+ vdev_raidz_io_done_write_impl(zio, rm->rm_row[i]);
+ }
+ } else if (zio->io_type == ZIO_TYPE_FREE) {
+ return;
+ } else {
+ for (int i = 0; i < rm->rm_nrows; i++) {
+ raidz_row_t *rr = rm->rm_row[i];
+ rr->rr_code =
+ vdev_raidz_io_done_reconstruct_known_missing(zio,
+ rr);
+ }
+
+ if (raidz_checksum_verify(zio) == 0) {
+ for (int i = 0; i < rm->rm_nrows; i++) {
+ raidz_row_t *rr = rm->rm_row[i];
+ atomic_inc_64(&raidz_corrected[rr->rr_code]);
+ vdev_raidz_io_done_verified(zio, rr);
}
+ zio_checksum_verified(zio);
} else {
/*
- * We either attempt to read all the parity columns or
- * none of them. If we didn't try to read parity, we
- * wouldn't be here in the correctable case. There must
- * also have been fewer parity errors than parity
- * columns or, again, we wouldn't be in this code path.
+ * This isn't a typical situation -- either we got a
+ * read error or a child silently returned bad data.
+ * Read every block so we can try again with as much
+ * data and parity as we can track down. If we've
+ * already been through once before, all children will
+ * be marked as tried so we'll proceed to combinatorial
+ * reconstruction.
*/
- ASSERT(parity_untried == 0);
- ASSERT(parity_errors < rm->rm_firstdatacol);
-
+ int nread = 0;
+ for (int i = 0; i < rm->rm_nrows; i++) {
+ nread += vdev_raidz_read_all(zio,
+ rm->rm_row[i]);
+ }
+ if (nread != 0) {
+ /*
+ * Normally our stage is VDEV_IO_DONE, but if
+ * we've already called redone(), it will have
+ * changed to VDEV_IO_START, in which case we
+ * don't want to call redone() again.
+ */
+ if (zio->io_stage != ZIO_STAGE_VDEV_IO_START)
+ zio_vdev_io_redone(zio);
+ return;
+ }
/*
- * Identify the data columns that reported an error.
+ * It would be too expensive to try every possible
+ * combination of failed sectors in every row, so
+ * instead we try every combination of failed current or
+ * past physical disk. This means that if the incorrect
+ * sectors were all on Nparity disks at any point in the
+ * past, we will find the correct data. I think that
+ * the only case where this is less durable than
+ * a non-expanded RAIDZ is if we have a silent
+ * failure during expansion. In that case, one block
+ * could be partially in the old format and partially
+ * in the new format, so we'd lose some sectors
+ * from the old format and some from the new format.
+ *
+ * e.g. logical_width=4 physical_width=6
+ * the 15 (6+5+4) possible failed disks are:
+ * width=6 child=0
+ * width=6 child=1
+ * width=6 child=2
+ * width=6 child=3
+ * width=6 child=4
+ * width=6 child=5
+ * width=5 child=0
+ * width=5 child=1
+ * width=5 child=2
+ * width=5 child=3
+ * width=5 child=4
+ * width=4 child=0
+ * width=4 child=1
+ * width=4 child=2
+ * width=4 child=3
+ * And we will try every combination of Nparity of these
+ * failing.
+ *
+ * As a first pass, we can generate every combo,
+ * and try reconstructing, ignoring any known
+ * failures. If any row has too many known + simulated
+ * failures, then we bail on reconstructing with this
+ * number of simulated failures. As an improvement,
+ * we could detect the number of whole known failures
+ * (i.e. we have known failures on these disks for
+ * every row; the disks never succeeded), and
+ * subtract that from the max # failures to simulate.
+ * We could go even further like the current
+ * combrec code, but that doesn't seem like it
+ * gains us very much. If we simulate a failure
+ * that is also a known failure, that's fine.
*/
- n = 0;
- for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) {
- rc = &rm->rm_col[c];
- if (rc->rc_error != 0) {
- ASSERT(n < VDEV_RAIDZ_MAXPARITY);
- tgts[n++] = c;
- }
- }
-
- ASSERT(rm->rm_firstdatacol >= n);
-
- code = vdev_raidz_reconstruct(rm, tgts, n);
-
- if (raidz_checksum_verify(zio) == 0) {
- atomic_inc_64(&raidz_corrected[code]);
-
+ if (vdev_raidz_combrec(zio) != 0) {
/*
- * If we read more parity disks than were used
- * for reconstruction, confirm that the other
- * parity disks produced correct data. This
- * routine is suboptimal in that it regenerates
- * the parity that we already used in addition
- * to the parity that we're attempting to
- * verify, but this should be a relatively
- * uncommon case, and can be optimized if it
- * becomes a problem. Note that we regenerate
- * parity when resilvering so we can write it
- * out to failed devices later.
+ * We're here because vdev_raidz_combrec() failed:
+ * either some row has more known plus simulated
+ * failures than parity, or no combination tried
+ * produced a valid checksum.
+ *
+ * In either case, there is enough bad data to prevent
+ * reconstruction.
+ *
+ * Start checksum ereports for all children which haven't
+ * failed, and the IO wasn't speculative.
*/
- if (parity_errors < rm->rm_firstdatacol - n ||
- (zio->io_flags & ZIO_FLAG_RESILVER)) {
- n = raidz_parity_verify(zio, rm);
- unexpected_errors += n;
- ASSERT(parity_errors + n <=
- rm->rm_firstdatacol);
+ zio->io_error = SET_ERROR(ECKSUM);
+
+ if (!(zio->io_flags & ZIO_FLAG_SPECULATIVE)) {
+ for (int i = 0; i < rm->rm_nrows; i++) {
+ raidz_row_t *rr = rm->rm_row[i];
+ for (int c = 0; c < rr->rr_cols; c++) {
+ raidz_col_t *rc = &rr->rr_col[c];
+ if (rc->rc_error == 0) {
+ zio_bad_cksum_t zbc;
+ zbc.zbc_has_cksum = 0;
+ zbc.zbc_injected =
+ rm->rm_ecksuminjected;
+
+ zfs_ereport_start_checksum(
+ zio->io_spa,
+ zio->io_vd->vdev_child[rc->rc_devidx],
+ zio, rc->rc_offset, rc->rc_size,
+ (void *)(uintptr_t)c, &zbc);
+ }
+ }
+ }
}
-
- goto done;
}
}
}
+ ASSERT(!vdrz->vn_expanding);
+}
+
+/*
+ * Set the state of a RAID-Z vdev from the counts of its faulted and
+ * degraded children: CANT_OPEN when more children are faulted than
+ * there are parity columns, DEGRADED when any child is faulted or
+ * degraded, HEALTHY otherwise.
+ */
+static void
+vdev_raidz_state_change(vdev_t *vd, int faulted, int degraded)
+{
+	vdev_raidz_t *vdrz = vd->vdev_tsd;
+
+	if (faulted > vdrz->vd_nparity) {
+		vdev_set_state(vd, B_FALSE, VDEV_STATE_CANT_OPEN,
+		    VDEV_AUX_NO_REPLICAS);
+		return;
+	}
+
+	if (degraded + faulted != 0) {
+		vdev_set_state(vd, B_FALSE, VDEV_STATE_DEGRADED,
+		    VDEV_AUX_NONE);
+		return;
+	}
+
+	vdev_set_state(vd, B_FALSE, VDEV_STATE_HEALTHY, VDEV_AUX_NONE);
+}
+
+/*
+ * range_tree_walk() callback: copy the sectors in [start, start + size)
+ * of the RAID-Z vdev passed as arg from the old layout
+ * (vdev_children - 1 children) to the new layout (vdev_children
+ * children), one (1 << ashift)-byte sector at a time.
+ *
+ * Sector i lives on child (i % width) at sector row (i / width), past the
+ * VDEV_LABEL_START_SIZE bytes at the front of each child. Sectors below
+ * old_children map to the same child and row at both widths, so the copy
+ * never starts lower than sector old_children.
+ *
+ * Any read or write failure trips VERIFY0().
+ */
+static void
+raidz_copy_range(void *arg, uint64_t start, uint64_t size)
+{
+	vdev_t *vd = arg;
+	int ashift = vd->vdev_top->vdev_ashift;
+	int old_children = vd->vdev_children - 1;
+	spa_t *spa = vd->vdev_spa;
+
+	ASSERT(IS_P2ALIGNED(start, 1 << ashift));
+	ASSERT(IS_P2ALIGNED(size, 1 << ashift));
+
+	abd_t *abd = abd_alloc_for_io(1 << ashift, B_FALSE);
+	for (uint64_t i = MAX(start >> ashift, old_children);
+	    i < (start + size) >> ashift; i++) {
+		int child = i % old_children;
+		/*
+		 * Byte offsets must be 64-bit: an int would truncate
+		 * (row << ashift) once it reaches 2 GiB and send the
+		 * I/O to the wrong place on the child.
+		 */
+		uint64_t offset = (i / old_children) << ashift;
+		spa_config_enter(spa, SCL_STATE, spa, RW_READER);
+		/* Read the sector from its location in the old layout. */
+		VERIFY0(zio_wait(zio_read_phys(NULL,
+		    vd->vdev_child[child],
+		    offset + VDEV_LABEL_START_SIZE,
+		    1 << ashift, abd,
+		    ZIO_CHECKSUM_OFF, NULL, NULL,
+		    ZIO_PRIORITY_REMOVAL, 0, B_FALSE)));
+
+		/* Write it to its location in the new, wider layout. */
+		child = i % vd->vdev_children;
+		offset = (i / vd->vdev_children) << ashift;
+		VERIFY0(zio_wait(zio_write_phys(NULL,
+		    vd->vdev_child[child],
+		    offset + VDEV_LABEL_START_SIZE,
+		    1 << ashift, abd,
+		    ZIO_CHECKSUM_OFF, NULL, NULL,
+		    ZIO_PRIORITY_REMOVAL, 0, B_FALSE)));
+		spa_config_exit(spa, SCL_STATE, spa);
+	}
+	abd_free(abd);
+}
+
+void
+vdev_raidz_attach_sync(void *arg, dmu_tx_t *tx)
+{
+ vdev_t *new_child = arg;
+ spa_t *spa = new_child->vdev_spa;
+ vdev_t *raidvd = new_child->vdev_parent;
+ vdev_raidz_t *vdrz = raidvd->vdev_tsd;
+ ASSERT3P(raidvd->vdev_ops, ==, &vdev_raidz_ops);
+ ASSERT3P(raidvd->vdev_top, ==, raidvd);
+ ASSERT3U(raidvd->vdev_children, >, vdrz->vd_logical_width);
+ ASSERT3U(raidvd->vdev_children, ==, vdrz->vd_physical_width + 1);
/*
- * This isn't a typical situation -- either we got a read error or
- * a child silently returned bad data. Read every block so we can
- * try again with as much data and parity as we can track down. If
- * we've already been through once before, all children will be marked
- * as tried so we'll proceed to combinatorial reconstruction.
+ * XXX assuming that no other i/o takes place while this is happening,
+ * until we increment physical_width. But ZIL could do i/o.
*/
- unexpected_errors = 1;
- rm->rm_missingdata = 0;
- rm->rm_missingparity = 0;
+ vdrz->vn_expanding = B_TRUE;
- for (c = 0; c < rm->rm_cols; c++) {
- if (rm->rm_col[c].rc_tried)
- continue;
+ /*spa_config_enter(spa, SCL_CONFIG, FTAG, RW_READER);*/
- zio_vdev_io_redone(zio);
- do {
- rc = &rm->rm_col[c];
- if (rc->rc_tried)
- continue;
- zio_nowait(zio_vdev_child_io(zio, NULL,
- vd->vdev_child[rc->rc_devidx],
- rc->rc_offset, rc->rc_abd, rc->rc_size,
- zio->io_type, zio->io_priority, 0,
- vdev_raidz_child_done, rc));
- } while (++c < rm->rm_cols);
+ range_tree_t *rt = range_tree_create(NULL, NULL);
- return;
+ for (uint64_t i = 0; i < raidvd->vdev_ms_count; i++) {
+ metaslab_t *msp = raidvd->vdev_ms[i];
+
+ /*vdev_initialize_ms_mark(msp);*/
+ mutex_enter(&msp->ms_lock);
+
+ metaslab_load_wait(msp);
+ if (!msp->ms_loaded)
+ VERIFY0(metaslab_load(msp));
+
+ /*
+ * We want to copy everything except the free (allocatable)
+ * space. Note that there may be a little bit more free
+ * space (e.g. in ms_defer), and it's fine to copy that too.
+ */
+ ASSERT(range_tree_is_empty(rt));
+ range_tree_add(rt, msp->ms_start, msp->ms_size);
+ range_tree_walk(msp->ms_allocatable, range_tree_remove, rt);
+ mutex_exit(&msp->ms_lock);
+
+ /*spa_config_exit(spa, SCL_CONFIG, FTAG);*/
+ /* Note, _vacate() doesn't visit in order */
+ range_tree_walk(rt, raidz_copy_range, raidvd);
+ range_tree_vacate(rt, NULL, NULL);
+ /*vdev_initialize_ms_unmark(msp);*/
+ /*spa_config_enter(spa, SCL_CONFIG, FTAG, RW_READER);*/
}
+ /*spa_config_exit(spa, SCL_CONFIG, FTAG);*/
+ range_tree_destroy(rt);
+
+ vdrz->vd_physical_width++;
+
+#if 0
+ raidvd->vdev_expanding = B_TRUE;
+ vdev_reopen(raidvd);
+ raidvd->vdev_expanding = B_FALSE;
+#endif
+
+ vdrz->vn_expanding = B_FALSE;
+ /* Ensure that widths get written to label config */
+ vdev_config_dirty(raidvd);
+}
+
+/*
+ * Add RAIDZ-specific fields to the config nvlist.
+ * XXX add this to vdev_ops_t?
+ */
+void
+vdev_raidz_config_generate(vdev_t *vd, nvlist_t *nv)
+{
+ spa_t *spa = vd->vdev_spa;
+ ASSERT3P(vd->vdev_ops, ==, &vdev_raidz_ops);
+ vdev_raidz_t *vdrz = vd->vdev_tsd;
+
+ /*
+ * Make sure someone hasn't managed to sneak a fancy new vdev
+ * into a crufty old storage pool.
+ */
+ ASSERT(vdrz->vd_nparity == 1 ||
+ (vdrz->vd_nparity <= 2 &&
+ spa_version(spa) >= SPA_VERSION_RAIDZ2) ||
+ (vdrz->vd_nparity <= 3 &&
+ spa_version(spa) >= SPA_VERSION_RAIDZ3));
+
/*
- * At this point we've attempted to reconstruct the data given the
- * errors we detected, and we've attempted to read all columns. There
- * must, therefore, be one or more additional problems -- silent errors
- * resulting in invalid data rather than explicit I/O errors resulting
- * in absent data. We check if there is enough additional data to
- * possibly reconstruct the data and then perform combinatorial
- * reconstruction over all possible combinations. If that fails,
- * we're cooked.
+ * Note that we'll add these even on storage pools where they
+ * aren't strictly required -- older software will just ignore
+ * them.
*/
- if (total_errors > rm->rm_firstdatacol) {
- zio->io_error = vdev_raidz_worst_error(rm);
+ fnvlist_add_uint64(nv, ZPOOL_CONFIG_NPARITY, vdrz->vd_nparity);
+ fnvlist_add_uint64(nv, ZPOOL_CONFIG_RAIDZ_LOGICAL_WIDTH,
+ vdrz->vd_logical_width);
+}
+
+/*
+ * Set RAIDZ-specific fields in the vdev_t, based on the config.
+ * Can't assume that anything about the vdev_t is already set.
+ * XXX add this to vdev_ops_t?
+ */
+void *
+vdev_raidz_get_tsd(spa_t *spa, nvlist_t *nv)
+{
+ uint64_t nparity, lw;
+ vdev_raidz_t *vdrz = kmem_zalloc(sizeof (*vdrz), KM_SLEEP);
+
+ uint_t children;
+ nvlist_t **child;
+ int error = nvlist_lookup_nvlist_array(nv,
+ ZPOOL_CONFIG_CHILDREN, &child, &children);
+ if (error != 0)
+ goto out;
+
+ vdrz->vd_logical_width = children;
+ vdrz->vd_physical_width = children;
+
+ if (nvlist_lookup_uint64(nv, ZPOOL_CONFIG_RAIDZ_LOGICAL_WIDTH,
+ &lw) == 0) {
+ vdrz->vd_logical_width = lw;
+ }
- } else if (total_errors < rm->rm_firstdatacol &&
- (code = vdev_raidz_combrec(zio, total_errors, data_errors)) != 0) {
+ if (nvlist_lookup_uint64(nv, ZPOOL_CONFIG_NPARITY,
+ &nparity) == 0) {
+ if (nparity == 0 || nparity > VDEV_RAIDZ_MAXPARITY)
+ goto out;
/*
- * If we didn't use all the available parity for the
- * combinatorial reconstruction, verify that the remaining
- * parity is correct.
+ * Previous versions could only support 1 or 2 parity
+ * devices.
*/
- if (code != (1 << rm->rm_firstdatacol) - 1)
- (void) raidz_parity_verify(zio, rm);
+ if (nparity > 1 &&
+ spa_version(spa) < SPA_VERSION_RAIDZ2)
+ goto out;
+ if (nparity > 2 &&
+ spa_version(spa) < SPA_VERSION_RAIDZ3)
+ goto out;
} else {
/*
- * We're here because either:
- *
- * total_errors == rm_first_datacol, or
- * vdev_raidz_combrec() failed
- *
- * In either case, there is enough bad data to prevent
- * reconstruction.
- *
- * Start checksum ereports for all children which haven't
- * failed, and the IO wasn't speculative.
+ * We require the parity to be specified for SPAs that
+ * support multiple parity levels.
*/
- zio->io_error = SET_ERROR(ECKSUM);
-
- if (!(zio->io_flags & ZIO_FLAG_SPECULATIVE)) {
- for (c = 0; c < rm->rm_cols; c++) {
- rc = &rm->rm_col[c];
- if (rc->rc_error == 0) {
- zio_bad_cksum_t zbc;
- zbc.zbc_has_cksum = 0;
- zbc.zbc_injected =
- rm->rm_ecksuminjected;
-
- zfs_ereport_start_checksum(
- zio->io_spa,
- vd->vdev_child[rc->rc_devidx],
- zio, rc->rc_offset, rc->rc_size,
- (void *)(uintptr_t)c, &zbc);
- }
- }
- }
- }
-
-done:
- zio_checksum_verified(zio);
-
- if (zio->io_error == 0 && spa_writeable(zio->io_spa) &&
- (unexpected_errors || (zio->io_flags & ZIO_FLAG_RESILVER))) {
+ if (spa_version(spa) >= SPA_VERSION_RAIDZ2)
+ goto out;
/*
- * Use the good data we have in hand to repair damaged children.
+ * Otherwise, we default to 1 parity device for RAID-Z.
*/
- for (c = 0; c < rm->rm_cols; c++) {
- rc = &rm->rm_col[c];
- cvd = vd->vdev_child[rc->rc_devidx];
-
- if (rc->rc_error == 0)
- continue;
-
- zio_nowait(zio_vdev_child_io(zio, NULL, cvd,
- rc->rc_offset, rc->rc_abd, rc->rc_size,
- ZIO_TYPE_WRITE, ZIO_PRIORITY_ASYNC_WRITE,
- ZIO_FLAG_IO_REPAIR | (unexpected_errors ?
- ZIO_FLAG_SELF_HEAL : 0), NULL, NULL));
- }
+ nparity = 1;
}
-}
-
-static void
-vdev_raidz_state_change(vdev_t *vd, int faulted, int degraded)
-{
- if (faulted > vd->vdev_nparity)
- vdev_set_state(vd, B_FALSE, VDEV_STATE_CANT_OPEN,
- VDEV_AUX_NO_REPLICAS);
- else if (degraded + faulted != 0)
- vdev_set_state(vd, B_FALSE, VDEV_STATE_DEGRADED, VDEV_AUX_NONE);
- else
- vdev_set_state(vd, B_FALSE, VDEV_STATE_HEALTHY, VDEV_AUX_NONE);
+ vdrz->vd_nparity = nparity;
+ return (vdrz);
+out:
+ kmem_free(vdrz, sizeof (*vdrz));
+ return (NULL);
}
vdev_ops_t vdev_raidz_ops = {
Index: sys/cddl/contrib/opensolaris/uts/common/sys/fs/zfs.h
===================================================================
--- sys/cddl/contrib/opensolaris/uts/common/sys/fs/zfs.h
+++ sys/cddl/contrib/opensolaris/uts/common/sys/fs/zfs.h
@@ -556,6 +556,7 @@
#define ZPOOL_CONFIG_SPARES "spares"
#define ZPOOL_CONFIG_IS_SPARE "is_spare"
#define ZPOOL_CONFIG_NPARITY "nparity"
+#define ZPOOL_CONFIG_RAIDZ_LOGICAL_WIDTH "raidz_logical_width"
#define ZPOOL_CONFIG_HOSTID "hostid"
#define ZPOOL_CONFIG_HOSTNAME "hostname"
#define ZPOOL_CONFIG_LOADED_TIME "initial_load_time"

File Metadata

Mime Type
text/plain
Expires
Mon, Apr 20, 11:02 AM (2 h, 54 m)
Storage Engine
blob
Storage Format
Raw Data
Storage Handle
31834715
Default Alt Text
D15124.diff (103 KB)

Event Timeline