Page Menu
Home
FreeBSD
Search
Configure Global Search
Log In
Files
F153142100
D15124.diff
No One
Temporary
Actions
View File
Edit File
Delete File
View Transforms
Subscribe
Mute Notifications
Flag For Later
Award Token
Size
103 KB
Referenced Files
None
Subscribers
None
D15124.diff
View Options
Index: sys/cddl/contrib/opensolaris/uts/common/fs/zfs/abd.c
===================================================================
--- sys/cddl/contrib/opensolaris/uts/common/fs/zfs/abd.c
+++ sys/cddl/contrib/opensolaris/uts/common/fs/zfs/abd.c
@@ -758,6 +758,9 @@
int ret = 0;
struct abd_iter aiter;
+ if (size == 0)
+ return (ret);
+
abd_verify(abd);
ASSERT3U(off + size, <=, abd->abd_size);
@@ -886,6 +889,9 @@
int ret = 0;
struct abd_iter daiter, saiter;
+ if (size == 0)
+ return (ret);
+
abd_verify(dabd);
abd_verify(sabd);
Index: sys/cddl/contrib/opensolaris/uts/common/fs/zfs/refcount.c
===================================================================
--- sys/cddl/contrib/opensolaris/uts/common/fs/zfs/refcount.c
+++ sys/cddl/contrib/opensolaris/uts/common/fs/zfs/refcount.c
@@ -91,7 +91,7 @@
{
reference_t *ref;
- ASSERT(rc->rc_count == number);
+ ASSERT3U(rc->rc_count, ==, number);
while (ref = list_head(&rc->rc_list)) {
list_remove(&rc->rc_list, ref);
kmem_cache_free(reference_cache, ref);
Index: sys/cddl/contrib/opensolaris/uts/common/fs/zfs/spa.c
===================================================================
--- sys/cddl/contrib/opensolaris/uts/common/fs/zfs/spa.c
+++ sys/cddl/contrib/opensolaris/uts/common/fs/zfs/spa.c
@@ -55,6 +55,7 @@
#include <sys/vdev_removal.h>
#include <sys/vdev_indirect_mapping.h>
#include <sys/vdev_indirect_births.h>
+#include <sys/vdev_raidz.h>
#include <sys/metaslab.h>
#include <sys/metaslab_impl.h>
#include <sys/uberblock_impl.h>
@@ -5923,8 +5924,9 @@
vdev_t *oldvd, *newvd, *newrootvd, *pvd, *tvd;
vdev_ops_t *pvops;
char *oldvdpath, *newvdpath;
- int newvd_isspare;
+ int newvd_isspare = B_FALSE;
int error;
+ boolean_t raidz = B_FALSE;
ASSERT(spa_writeable(spa));
@@ -5947,10 +5949,16 @@
if (oldvd == NULL)
return (spa_vdev_exit(spa, NULL, txg, ENODEV));
- if (!oldvd->vdev_ops->vdev_op_leaf)
+ if (oldvd->vdev_ops == &vdev_raidz_ops) {
+ raidz = B_TRUE;
+ } else if (!oldvd->vdev_ops->vdev_op_leaf) {
return (spa_vdev_exit(spa, NULL, txg, ENOTSUP));
+ }
- pvd = oldvd->vdev_parent;
+ if (raidz)
+ pvd = oldvd;
+ else
+ pvd = oldvd->vdev_parent;
if ((error = spa_config_parse(spa, &newrootvd, nvroot, NULL, 0,
VDEV_ALLOC_ATTACH)) != 0)
@@ -5979,6 +5987,7 @@
* vdev.
*/
if (pvd->vdev_ops != &vdev_mirror_ops &&
+ pvd->vdev_ops != &vdev_raidz_ops &&
pvd->vdev_ops != &vdev_root_ops)
return (spa_vdev_exit(spa, newrootvd, txg, ENOTSUP));
@@ -6018,7 +6027,8 @@
/*
* Make sure the new device is big enough.
*/
- if (newvd->vdev_asize < vdev_get_min_asize(oldvd))
+ vdev_t *min_vdev = raidz ? oldvd->vdev_child[0] : oldvd;
+ if (newvd->vdev_asize < vdev_get_min_asize(min_vdev))
return (spa_vdev_exit(spa, newrootvd, txg, EOVERFLOW));
/*
@@ -6028,35 +6038,48 @@
if (newvd->vdev_ashift > oldvd->vdev_top->vdev_ashift)
return (spa_vdev_exit(spa, newrootvd, txg, EDOM));
+ if (raidz) {
+ oldvdpath = kmem_asprintf("raidz%u-%u",
+ oldvd->vdev_nparity, oldvd->vdev_id);
+ } else {
+ oldvdpath = spa_strdup(oldvd->vdev_path);
+ }
+ newvdpath = spa_strdup(newvd->vdev_path);
+
/*
* If this is an in-place replacement, update oldvd's path and devid
* to make it distinguishable from newvd, and unopenable from now on.
*/
- if (strcmp(oldvd->vdev_path, newvd->vdev_path) == 0) {
+ if (strcmp(oldvdpath, newvdpath) == 0) {
spa_strfree(oldvd->vdev_path);
- oldvd->vdev_path = kmem_alloc(strlen(newvd->vdev_path) + 5,
+ oldvd->vdev_path = kmem_alloc(strlen(newvdpath) + 5,
KM_SLEEP);
- (void) sprintf(oldvd->vdev_path, "%s/%s",
- newvd->vdev_path, "old");
+ (void) sprintf(oldvd->vdev_path, "%s/old",
+ newvdpath);
if (oldvd->vdev_devid != NULL) {
spa_strfree(oldvd->vdev_devid);
oldvd->vdev_devid = NULL;
}
+ spa_strfree(oldvdpath);
+ oldvdpath = spa_strdup(oldvd->vdev_path);
}
/* mark the device being resilvered */
- newvd->vdev_resilver_txg = txg;
+ if (!raidz)
+ newvd->vdev_resilver_txg = txg;
/*
* If the parent is not a mirror, or if we're replacing, insert the new
* mirror/replacing/spare vdev above oldvd.
*/
- if (pvd->vdev_ops != pvops)
+ if (!raidz && pvd->vdev_ops != pvops)
pvd = vdev_add_parent(oldvd, pvops);
ASSERT(pvd->vdev_top->vdev_parent == rvd);
+#if 0
ASSERT(pvd->vdev_ops == pvops);
ASSERT(oldvd->vdev_parent == pvd);
+#endif
/*
* Extract the new device from its root and add it to pvd.
@@ -6079,29 +6102,34 @@
*/
dtl_max_txg = txg + TXG_CONCURRENT_STATES;
- vdev_dtl_dirty(newvd, DTL_MISSING, TXG_INITIAL,
- dtl_max_txg - TXG_INITIAL);
+ if (raidz) {
+ dmu_tx_t *tx = dmu_tx_create_assigned(spa->spa_dsl_pool, txg);
+ dsl_sync_task_nowait(spa->spa_dsl_pool, vdev_raidz_attach_sync,
+ newvd, 0, ZFS_SPACE_CHECK_EXTRA_RESERVED, tx);
+ dmu_tx_commit(tx);
+ } else {
+ vdev_dtl_dirty(newvd, DTL_MISSING, TXG_INITIAL,
+ dtl_max_txg - TXG_INITIAL);
- if (newvd->vdev_isspare) {
- spa_spare_activate(newvd);
- spa_event_notify(spa, newvd, NULL, ESC_ZFS_VDEV_SPARE);
- }
+ if (newvd->vdev_isspare) {
+ spa_spare_activate(newvd);
+ spa_event_notify(spa, newvd, NULL, ESC_ZFS_VDEV_SPARE);
+ }
- oldvdpath = spa_strdup(oldvd->vdev_path);
- newvdpath = spa_strdup(newvd->vdev_path);
- newvd_isspare = newvd->vdev_isspare;
+ newvd_isspare = newvd->vdev_isspare;
- /*
- * Mark newvd's DTL dirty in this txg.
- */
- vdev_dirty(tvd, VDD_DTL, newvd, txg);
+ /*
+ * Mark newvd's DTL dirty in this txg.
+ */
+ vdev_dirty(tvd, VDD_DTL, newvd, txg);
- /*
- * Schedule the resilver to restart in the future. We do this to
- * ensure that dmu_sync-ed blocks have been stitched into the
- * respective datasets.
- */
- dsl_resilver_restart(spa->spa_dsl_pool, dtl_max_txg);
+ /*
+ * Schedule the resilver to restart in the future. We do this to
+ * ensure that dmu_sync-ed blocks have been stitched into the
+ * respective datasets.
+ */
+ dsl_resilver_restart(spa->spa_dsl_pool, dtl_max_txg);
+ }
if (spa->spa_bootfs)
spa_event_notify(spa, newvd, NULL, ESC_ZFS_BOOTFS_VDEV_ATTACH);
@@ -6113,6 +6141,10 @@
*/
(void) spa_vdev_exit(spa, newrootvd, dtl_max_txg, 0);
+ if (raidz) {
+ error = vdev_online(spa, guid, ZFS_ONLINE_EXPAND, NULL);
+ }
+
spa_history_log_internal(spa, "vdev attach", NULL,
"%s vdev=%s %s vdev=%s",
replacing && newvd_isspare ? "spare in" :
Index: sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/vdev_raidz.h
===================================================================
--- sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/vdev_raidz.h
+++ sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/vdev_raidz.h
@@ -39,10 +39,18 @@
extern "C" {
#endif
-#ifdef _KERNEL
+typedef struct vdev_raidz {
+ int vd_logical_width;
+ int vd_physical_width;
+ int vd_nparity;
+ boolean_t vn_expanding;
+} vdev_raidz_t;
+
extern int vdev_raidz_physio(vdev_t *,
caddr_t, size_t, uint64_t, uint64_t, boolean_t, boolean_t);
-#endif
+extern void vdev_raidz_attach_sync(void *, dmu_tx_t *);
+extern void vdev_raidz_config_generate(vdev_t *, nvlist_t *);
+extern void *vdev_raidz_get_tsd(spa_t *, nvlist_t *);
#ifdef __cplusplus
}
#endif
Index: sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev.c
===================================================================
--- sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev.c
+++ sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev.c
@@ -49,6 +49,7 @@
#include <sys/arc.h>
#include <sys/zil.h>
#include <sys/dsl_scan.h>
+#include <sys/vdev_raidz.h>
#include <sys/abd.h>
#include <sys/trim_map.h>
@@ -584,7 +585,7 @@
{
vdev_ops_t *ops;
char *type;
- uint64_t guid = 0, islog, nparity;
+ uint64_t guid = 0, islog;
vdev_t *vd;
vdev_indirect_config_t *vic;
@@ -637,47 +638,21 @@
if (ops == &vdev_hole_ops && spa_version(spa) < SPA_VERSION_HOLES)
return (SET_ERROR(ENOTSUP));
- /*
- * Set the nparity property for RAID-Z vdevs.
- */
- nparity = -1ULL;
+ void *tsd = NULL;
+ int nparity = 0;
if (ops == &vdev_raidz_ops) {
- if (nvlist_lookup_uint64(nv, ZPOOL_CONFIG_NPARITY,
- &nparity) == 0) {
- if (nparity == 0 || nparity > VDEV_RAIDZ_MAXPARITY)
- return (SET_ERROR(EINVAL));
- /*
- * Previous versions could only support 1 or 2 parity
- * device.
- */
- if (nparity > 1 &&
- spa_version(spa) < SPA_VERSION_RAIDZ2)
- return (SET_ERROR(ENOTSUP));
- if (nparity > 2 &&
- spa_version(spa) < SPA_VERSION_RAIDZ3)
- return (SET_ERROR(ENOTSUP));
- } else {
- /*
- * We require the parity to be specified for SPAs that
- * support multiple parity levels.
- */
- if (spa_version(spa) >= SPA_VERSION_RAIDZ2)
- return (SET_ERROR(EINVAL));
- /*
- * Otherwise, we default to 1 parity device for RAID-Z.
- */
- nparity = 1;
- }
- } else {
- nparity = 0;
+ vdev_raidz_t *rz = tsd = vdev_raidz_get_tsd(spa, nv);
+ if (rz == NULL)
+ return (SET_ERROR(EINVAL));
+ nparity = rz->vd_nparity;
}
- ASSERT(nparity != -1ULL);
vd = vdev_alloc_common(spa, id, guid, ops);
vic = &vd->vdev_indirect_config;
vd->vdev_islog = islog;
vd->vdev_nparity = nparity;
+ vd->vdev_tsd = tsd;
if (nvlist_lookup_string(nv, ZPOOL_CONFIG_PATH, &vd->vdev_path) == 0)
vd->vdev_path = spa_strdup(vd->vdev_path);
@@ -849,6 +824,11 @@
ASSERT(vd->vdev_child == NULL);
ASSERT(vd->vdev_guid_sum == vd->vdev_guid);
+ if (vd->vdev_ops == &vdev_raidz_ops) {
+ vdev_raidz_t *rz = vd->vdev_tsd;
+ kmem_free(rz, sizeof(*rz));
+ }
+
/*
* Discard allocation state.
*/
@@ -3155,8 +3135,10 @@
if ((vd = spa_lookup_by_guid(spa, guid, B_TRUE)) == NULL)
return (spa_vdev_state_exit(spa, NULL, ENODEV));
+#if 0
if (!vd->vdev_ops->vdev_op_leaf)
return (spa_vdev_state_exit(spa, NULL, ENOTSUP));
+#endif
wasoffline = (vd->vdev_offline || vd->vdev_tmpoffline);
oldstate = vd->vdev_state;
Index: sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev_indirect.c
===================================================================
--- sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev_indirect.c
+++ sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev_indirect.c
@@ -1078,6 +1078,13 @@
if (vd->vdev_ops == &vdev_indirect_ops)
return;
+ printf("vdev_indirect_io_start_cb: src=%llx split_offset=%x dst: vd=%u off=%llx size=%x\n",
+ (long long)zio->io_offset,
+ (int)split_offset,
+ (int)vd->vdev_id,
+ (long long)offset,
+ (int)size);
+
zio_nowait(zio_vdev_child_io(zio, NULL, vd, offset,
abd_get_offset(zio->io_abd, split_offset),
size, zio->io_type, zio->io_priority,
Index: sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev_label.c
===================================================================
--- sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev_label.c
+++ sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev_label.c
@@ -141,6 +141,7 @@
#include <sys/zap.h>
#include <sys/vdev.h>
#include <sys/vdev_impl.h>
+#include <sys/vdev_raidz.h>
#include <sys/uberblock_impl.h>
#include <sys/metaslab.h>
#include <sys/metaslab_impl.h>
@@ -276,31 +277,13 @@
if (vd->vdev_fru != NULL)
fnvlist_add_string(nv, ZPOOL_CONFIG_FRU, vd->vdev_fru);
- if (vd->vdev_nparity != 0) {
- ASSERT(strcmp(vd->vdev_ops->vdev_op_type,
- VDEV_TYPE_RAIDZ) == 0);
+ if (vd->vdev_ops == &vdev_raidz_ops)
+ vdev_raidz_config_generate(vd, nv);
- /*
- * Make sure someone hasn't managed to sneak a fancy new vdev
- * into a crufty old storage pool.
- */
- ASSERT(vd->vdev_nparity == 1 ||
- (vd->vdev_nparity <= 2 &&
- spa_version(spa) >= SPA_VERSION_RAIDZ2) ||
- (vd->vdev_nparity <= 3 &&
- spa_version(spa) >= SPA_VERSION_RAIDZ3));
-
- /*
- * Note that we'll add the nparity tag even on storage pools
- * that only support a single parity device -- older software
- * will just ignore it.
- */
- fnvlist_add_uint64(nv, ZPOOL_CONFIG_NPARITY, vd->vdev_nparity);
- }
-
- if (vd->vdev_wholedisk != -1ULL)
+ if (vd->vdev_wholedisk != -1ULL) {
fnvlist_add_uint64(nv, ZPOOL_CONFIG_WHOLE_DISK,
vd->vdev_wholedisk);
+ }
if (vd->vdev_not_present && !(flags & VDEV_CONFIG_MISSING))
fnvlist_add_uint64(nv, ZPOOL_CONFIG_NOT_PRESENT, 1);
Index: sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev_raidz.c
===================================================================
--- sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev_raidz.c
+++ sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev_raidz.c
@@ -28,12 +28,14 @@
#include <sys/zfs_context.h>
#include <sys/spa.h>
+#include <sys/zap.h>
#include <sys/vdev_impl.h>
#ifdef illumos
#include <sys/vdev_disk.h>
#endif
#include <sys/vdev_file.h>
#include <sys/vdev_raidz.h>
+#include <sys/metaslab_impl.h>
#include <sys/zio.h>
#include <sys/zio_checksum.h>
#include <sys/abd.h>
@@ -41,6 +43,12 @@
#include <sys/fm/fs/zfs.h>
#include <sys/bio.h>
+#if 0
+#ifdef ZFS_DEBUG
+#include <sys/vdev_initialize.h> /* vdev_xlate testing */
+#endif
+#endif
+
/*
* Virtual device vector for RAID-Z.
*
@@ -113,27 +121,31 @@
uint64_t rc_offset; /* device offset */
uint64_t rc_size; /* I/O size */
abd_t *rc_abd; /* I/O data */
+ void *rc_orig_data; /* pre-reconstruction */
void *rc_gdata; /* used to store the "good" version */
int rc_error; /* I/O error for this device */
uint8_t rc_tried; /* Did we attempt this I/O column? */
uint8_t rc_skipped; /* Did we skip this I/O column? */
+ uint8_t rc_need_orig_restore; /* need to restore from orig_data? */
} raidz_col_t;
+typedef struct raidz_row {
+ uint64_t rr_cols; /* Regular column count */
+ uint64_t rr_missingdata; /* Count of missing data devices */
+ uint64_t rr_missingparity; /* Count of missing parity devices */
+ uint64_t rr_firstdatacol; /* First data column/parity count */
+ abd_t *rr_abd_copy; /* rm_asize-buffer of copied data */
+ int rr_code; /* reconstruction code */
+ raidz_col_t rr_col[0]; /* Flexible array of I/O columns */
+} raidz_row_t;
+
typedef struct raidz_map {
- uint64_t rm_cols; /* Regular column count */
- uint64_t rm_scols; /* Count including skipped columns */
- uint64_t rm_bigcols; /* Number of oversized columns */
- uint64_t rm_asize; /* Actual total I/O size */
- uint64_t rm_missingdata; /* Count of missing data devices */
- uint64_t rm_missingparity; /* Count of missing parity devices */
- uint64_t rm_firstdatacol; /* First data column/parity count */
- uint64_t rm_nskip; /* Skipped sectors for padding */
- uint64_t rm_skipstart; /* Column index of padding start */
- abd_t *rm_abd_copy; /* rm_asize-buffer of copied data */
uintptr_t rm_reports; /* # of referencing checksum reports */
- uint8_t rm_freed; /* map no longer has referencing ZIO */
- uint8_t rm_ecksuminjected; /* checksum error was injected */
- raidz_col_t rm_col[1]; /* Flexible array of I/O columns */
+ boolean_t rm_freed; /* map no longer has referencing ZIO */
+ boolean_t rm_ecksuminjected; /* checksum error was injected */
+ int rm_nrows;
+ int rm_nskip; /* Sectors skipped for padding */
+ raidz_row_t *rm_row[0]; /* flexible array of rows */
} raidz_map_t;
#define VDEV_RAIDZ_P 0
@@ -241,7 +253,7 @@
0x74, 0xd6, 0xf4, 0xea, 0xa8, 0x50, 0x58, 0xaf,
};
-static void vdev_raidz_generate_parity(raidz_map_t *rm);
+static void vdev_raidz_generate_parity(raidz_row_t *);
/*
* Multiply a given number by 2 raised to the given power.
@@ -263,31 +275,46 @@
}
static void
-vdev_raidz_map_free(raidz_map_t *rm)
+vdev_raidz_row_free(raidz_row_t *rr)
{
int c;
- size_t size;
- for (c = 0; c < rm->rm_firstdatacol; c++) {
- if (rm->rm_col[c].rc_abd != NULL)
- abd_free(rm->rm_col[c].rc_abd);
+ for (c = 0; c < rr->rr_firstdatacol && c < rr->rr_cols; c++) {
+ if (rr->rr_col[c].rc_abd != NULL)
+ abd_free(rr->rr_col[c].rc_abd);
- if (rm->rm_col[c].rc_gdata != NULL)
- zio_buf_free(rm->rm_col[c].rc_gdata,
- rm->rm_col[c].rc_size);
+ if (rr->rr_col[c].rc_gdata != NULL) {
+ zio_buf_free(rr->rr_col[c].rc_gdata,
+ rr->rr_col[c].rc_size);
+ }
+ if (rr->rr_col[c].rc_orig_data != NULL) {
+ zio_buf_free(rr->rr_col[c].rc_orig_data,
+ rr->rr_col[c].rc_size);
+ }
}
- size = 0;
- for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) {
- if (rm->rm_col[c].rc_abd != NULL)
- abd_put(rm->rm_col[c].rc_abd);
- size += rm->rm_col[c].rc_size;
+ for (c = rr->rr_firstdatacol; c < rr->rr_cols; c++) {
+ if (rr->rr_col[c].rc_abd != NULL)
+ abd_put(rr->rr_col[c].rc_abd);
+ if (rr->rr_col[c].rc_orig_data != NULL) {
+ zio_buf_free(rr->rr_col[c].rc_orig_data,
+ rr->rr_col[c].rc_size);
+ }
}
- if (rm->rm_abd_copy != NULL)
- abd_free(rm->rm_abd_copy);
+ if (rr->rr_abd_copy != NULL)
+ abd_free(rr->rr_abd_copy);
- kmem_free(rm, offsetof(raidz_map_t, rm_col[rm->rm_scols]));
+ kmem_free(rr, offsetof(raidz_row_t, rr_col[rr->rr_cols]));
+}
+
+static void
+vdev_raidz_map_free(raidz_map_t *rm)
+{
+ for (int i = 0; i < rm->rm_nrows; i++) {
+ vdev_raidz_row_free(rm->rm_row[i]);
+ }
+ kmem_free(rm, offsetof(raidz_map_t, rm_row[rm->rm_nrows]));
}
static void
@@ -296,10 +323,11 @@
raidz_map_t *rm = zio->io_vsd;
ASSERT0(rm->rm_freed);
- rm->rm_freed = 1;
+ rm->rm_freed = B_TRUE;
- if (rm->rm_reports == 0)
+ if (rm->rm_reports == 0) {
vdev_raidz_map_free(rm);
+ }
}
/*ARGSUSED*/
@@ -310,7 +338,7 @@
ASSERT3U(rm->rm_reports, >, 0);
- if (--rm->rm_reports == 0 && rm->rm_freed != 0)
+ if (--rm->rm_reports == 0 && rm->rm_freed)
vdev_raidz_map_free(rm);
}
@@ -324,18 +352,22 @@
const char *good = NULL;
char *bad;
+ zfs_dbgmsg("checksum error on rm=%p", rm);
+
if (good_data == NULL) {
zfs_ereport_finish_checksum(zcr, NULL, NULL, B_FALSE);
return;
}
- if (c < rm->rm_firstdatacol) {
+ zfs_ereport_finish_checksum(zcr, NULL, NULL, B_FALSE);
+#if 0
+ if (c < rm->rr_firstdatacol) {
/*
* The first time through, calculate the parity blocks for
* the good data (this relies on the fact that the good
* data never changes for a given logical ZIO)
*/
- if (rm->rm_col[0].rc_gdata == NULL) {
+ if (rm->rr_col[0].rc_gdata == NULL) {
abd_t *bad_parity[VDEV_RAIDZ_MAXPARITY];
char *buf;
int offset;
@@ -345,22 +377,22 @@
* good_data, first saving the parity bufs and
* replacing them with buffers to hold the result.
*/
- for (x = 0; x < rm->rm_firstdatacol; x++) {
- bad_parity[x] = rm->rm_col[x].rc_abd;
- rm->rm_col[x].rc_gdata =
- zio_buf_alloc(rm->rm_col[x].rc_size);
- rm->rm_col[x].rc_abd =
- abd_get_from_buf(rm->rm_col[x].rc_gdata,
- rm->rm_col[x].rc_size);
+ for (x = 0; x < rm->rr_firstdatacol; x++) {
+ bad_parity[x] = rm->rr_col[x].rc_abd;
+ rm->rr_col[x].rc_gdata =
+ zio_buf_alloc(rm->rr_col[x].rc_size);
+ rm->rr_col[x].rc_abd =
+ abd_get_from_buf(rm->rr_col[x].rc_gdata,
+ rm->rr_col[x].rc_size);
}
/* fill in the data columns from good_data */
buf = (char *)good_data;
- for (; x < rm->rm_cols; x++) {
- abd_put(rm->rm_col[x].rc_abd);
- rm->rm_col[x].rc_abd = abd_get_from_buf(buf,
- rm->rm_col[x].rc_size);
- buf += rm->rm_col[x].rc_size;
+ for (; x < rm->rr_cols; x++) {
+ abd_put(rm->rr_col[x].rc_abd);
+ rm->rr_col[x].rc_abd = abd_get_from_buf(buf,
+ rm->rr_col[x].rc_size);
+ buf += rm->rr_col[x].rc_size;
}
/*
@@ -369,34 +401,35 @@
vdev_raidz_generate_parity(rm);
/* restore everything back to its original state */
- for (x = 0; x < rm->rm_firstdatacol; x++) {
- abd_put(rm->rm_col[x].rc_abd);
- rm->rm_col[x].rc_abd = bad_parity[x];
+ for (x = 0; x < rm->rr_firstdatacol; x++) {
+ abd_put(rm->rr_col[x].rc_abd);
+ rm->rr_col[x].rc_abd = bad_parity[x];
}
offset = 0;
- for (x = rm->rm_firstdatacol; x < rm->rm_cols; x++) {
- abd_put(rm->rm_col[x].rc_abd);
- rm->rm_col[x].rc_abd = abd_get_offset(
- rm->rm_abd_copy, offset);
- offset += rm->rm_col[x].rc_size;
+ for (x = rm->rr_firstdatacol; x < rm->rr_cols; x++) {
+ abd_put(rm->rr_col[x].rc_abd);
+ rm->rr_col[x].rc_abd = abd_get_offset(
+ rm->rr_abd_copy, offset);
+ offset += rm->rr_col[x].rc_size;
}
}
- ASSERT3P(rm->rm_col[c].rc_gdata, !=, NULL);
- good = rm->rm_col[c].rc_gdata;
+ ASSERT3P(rm->rr_col[c].rc_gdata, !=, NULL);
+ good = rm->rr_col[c].rc_gdata;
} else {
/* adjust good_data to point at the start of our column */
good = good_data;
- for (x = rm->rm_firstdatacol; x < c; x++)
- good += rm->rm_col[x].rc_size;
+ for (x = rm->rr_firstdatacol; x < c; x++)
+ good += rm->rr_col[x].rc_size;
}
- bad = abd_borrow_buf_copy(rm->rm_col[c].rc_abd, rm->rm_col[c].rc_size);
+ bad = abd_borrow_buf_copy(rm->rr_col[c].rc_abd, rm->rr_col[c].rc_size);
/* we drop the ereport if it ends up that the data was good */
zfs_ereport_finish_checksum(zcr, good, bad, B_TRUE);
- abd_return_buf(rm->rm_col[c].rc_abd, bad, rm->rm_col[c].rc_size);
+ abd_return_buf(rm->rr_col[c].rc_abd, bad, rm->rr_col[c].rc_size);
+#endif
}
/*
@@ -409,10 +442,7 @@
vdev_raidz_cksum_report(zio_t *zio, zio_cksum_report_t *zcr, void *arg)
{
size_t c = (size_t)(uintptr_t)arg;
- size_t offset;
-
raidz_map_t *rm = zio->io_vsd;
- size_t size;
/* set up the report and bump the refcount */
zcr->zcr_cbdata = rm;
@@ -423,7 +453,7 @@
rm->rm_reports++;
ASSERT3U(rm->rm_reports, >, 0);
- if (rm->rm_abd_copy != NULL)
+ if (rm->rm_row[0]->rr_abd_copy != NULL)
return;
/*
@@ -435,24 +465,33 @@
* to copy them.
*/
- size = 0;
- for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++)
- size += rm->rm_col[c].rc_size;
+ for (int i = 0; i < rm->rm_nrows; i++) {
+ raidz_row_t *rr = rm->rm_row[i];
+ size_t offset;
+ size_t size = 0;
+ for (c = rr->rr_firstdatacol; c < rr->rr_cols; c++)
+ size += rr->rr_col[c].rc_size;
+
+ rr->rr_abd_copy =
+ abd_alloc_sametype(rr->rr_col[rr->rr_firstdatacol].rc_abd,
+ size);
- rm->rm_abd_copy =
- abd_alloc_sametype(rm->rm_col[rm->rm_firstdatacol].rc_abd, size);
+ for (offset = 0, c = rr->rr_firstdatacol; c < rr->rr_cols; c++) {
+ raidz_col_t *col = &rr->rr_col[c];
+
+ if (col->rc_size == 0)
+ continue;
- for (offset = 0, c = rm->rm_firstdatacol; c < rm->rm_cols; c++) {
- raidz_col_t *col = &rm->rm_col[c];
- abd_t *tmp = abd_get_offset(rm->rm_abd_copy, offset);
+ abd_t *tmp = abd_get_offset(rr->rr_abd_copy, offset);
- abd_copy(tmp, col->rc_abd, col->rc_size);
- abd_put(col->rc_abd);
- col->rc_abd = tmp;
+ abd_copy(tmp, col->rc_abd, col->rc_size);
+ abd_put(col->rc_abd);
+ col->rc_abd = tmp;
- offset += col->rc_size;
+ offset += col->rc_size;
+ }
+ ASSERT3U(offset, ==, size);
}
- ASSERT3U(offset, ==, size);
}
static const zio_vsd_ops_t vdev_raidz_vsd_ops = {
@@ -468,7 +507,7 @@
vdev_raidz_map_alloc(abd_t *abd, uint64_t size, uint64_t offset, boolean_t dofree,
uint64_t unit_shift, uint64_t dcols, uint64_t nparity)
{
- raidz_map_t *rm;
+ raidz_row_t *rr;
/* The starting RAIDZ (parent) vdev sector of the block. */
uint64_t b = offset >> unit_shift;
/* The zio's size in units of the vdev's minimum sector size. */
@@ -477,9 +516,13 @@
uint64_t f = b % dcols;
/* The starting byte offset on each child vdev. */
uint64_t o = (b / dcols) << unit_shift;
- uint64_t q, r, c, bc, col, acols, scols, coff, devidx, asize, tot;
+ uint64_t q, r, c, bc, col, acols, coff, devidx, asize, tot;
uint64_t off = 0;
+ raidz_map_t *rm =
+ kmem_zalloc(offsetof(raidz_map_t, rm_row[1]), KM_SLEEP);
+ rm->rm_nrows = 1;
+
/*
* "Quotient": The number of data sectors for this stripe on all but
* the "big column" child vdevs that also contain "remainder" data.
@@ -502,77 +545,63 @@
tot = s + nparity * (q + (r == 0 ? 0 : 1));
/* acols: The columns that will be accessed. */
- /* scols: The columns that will be accessed or skipped. */
if (q == 0) {
/* Our I/O request doesn't span all child vdevs. */
acols = bc;
- scols = MIN(dcols, roundup(bc, nparity + 1));
} else {
acols = dcols;
- scols = dcols;
}
- ASSERT3U(acols, <=, scols);
-
- rm = kmem_alloc(offsetof(raidz_map_t, rm_col[scols]), KM_SLEEP);
+ rr = kmem_alloc(offsetof(raidz_row_t, rr_col[acols]), KM_SLEEP);
+ rm->rm_row[0] = rr;
- rm->rm_cols = acols;
- rm->rm_scols = scols;
- rm->rm_bigcols = bc;
- rm->rm_skipstart = bc;
- rm->rm_missingdata = 0;
- rm->rm_missingparity = 0;
- rm->rm_firstdatacol = nparity;
- rm->rm_abd_copy = NULL;
- rm->rm_reports = 0;
- rm->rm_freed = 0;
- rm->rm_ecksuminjected = 0;
+ rr->rr_cols = acols;
+ rr->rr_missingdata = 0;
+ rr->rr_missingparity = 0;
+ rr->rr_firstdatacol = nparity;
+ rr->rr_abd_copy = NULL;
asize = 0;
- for (c = 0; c < scols; c++) {
+ for (c = 0; c < acols; c++) {
col = f + c;
coff = o;
if (col >= dcols) {
col -= dcols;
coff += 1ULL << unit_shift;
}
- rm->rm_col[c].rc_devidx = col;
- rm->rm_col[c].rc_offset = coff;
- rm->rm_col[c].rc_abd = NULL;
- rm->rm_col[c].rc_gdata = NULL;
- rm->rm_col[c].rc_error = 0;
- rm->rm_col[c].rc_tried = 0;
- rm->rm_col[c].rc_skipped = 0;
-
- if (c >= acols)
- rm->rm_col[c].rc_size = 0;
- else if (c < bc)
- rm->rm_col[c].rc_size = (q + 1) << unit_shift;
+ rr->rr_col[c].rc_devidx = col;
+ rr->rr_col[c].rc_offset = coff;
+ rr->rr_col[c].rc_abd = NULL;
+ rr->rr_col[c].rc_gdata = NULL;
+ rr->rr_col[c].rc_orig_data = NULL;
+ rr->rr_col[c].rc_error = 0;
+ rr->rr_col[c].rc_tried = 0;
+ rr->rr_col[c].rc_skipped = 0;
+ rr->rr_col[c].rc_need_orig_restore = B_FALSE;
+
+ if (c < bc)
+ rr->rr_col[c].rc_size = (q + 1) << unit_shift;
else
- rm->rm_col[c].rc_size = q << unit_shift;
+ rr->rr_col[c].rc_size = q << unit_shift;
- asize += rm->rm_col[c].rc_size;
+ asize += rr->rr_col[c].rc_size;
}
ASSERT3U(asize, ==, tot << unit_shift);
- rm->rm_asize = roundup(asize, (nparity + 1) << unit_shift);
rm->rm_nskip = roundup(tot, nparity + 1) - tot;
- ASSERT3U(rm->rm_asize - asize, ==, rm->rm_nskip << unit_shift);
- ASSERT3U(rm->rm_nskip, <=, nparity);
if (!dofree) {
- for (c = 0; c < rm->rm_firstdatacol; c++) {
- rm->rm_col[c].rc_abd =
- abd_alloc_linear(rm->rm_col[c].rc_size, B_TRUE);
- }
+ for (c = 0; c < rr->rr_firstdatacol; c++)
+ rr->rr_col[c].rc_abd =
+ abd_alloc_linear(rr->rr_col[c].rc_size, B_TRUE);
- rm->rm_col[c].rc_abd = abd_get_offset(abd, 0);
- off = rm->rm_col[c].rc_size;
+ rr->rr_col[c].rc_abd = abd_get_offset(abd, 0);
+ off = rr->rr_col[c].rc_size;
for (c = c + 1; c < acols; c++) {
- rm->rm_col[c].rc_abd = abd_get_offset(abd, off);
- off += rm->rm_col[c].rc_size;
+ rr->rr_col[c].rc_abd = abd_get_offset(abd, off);
+ off += rr->rr_col[c].rc_size;
}
}
@@ -596,20 +625,182 @@
* skip the first column since at least one data and one parity
* column must appear in each row.
*/
- ASSERT(rm->rm_cols >= 2);
- ASSERT(rm->rm_col[0].rc_size == rm->rm_col[1].rc_size);
+ ASSERT(rr->rr_cols >= 2);
+ ASSERT(rr->rr_col[0].rc_size == rr->rr_col[1].rc_size);
+
+ if (rr->rr_firstdatacol == 1 && (offset & (1ULL << 20))) {
+ devidx = rr->rr_col[0].rc_devidx;
+ o = rr->rr_col[0].rc_offset;
+ rr->rr_col[0].rc_devidx = rr->rr_col[1].rc_devidx;
+ rr->rr_col[0].rc_offset = rr->rr_col[1].rc_offset;
+ rr->rr_col[1].rc_devidx = devidx;
+ rr->rr_col[1].rc_offset = o;
+ }
+
+ return (rm);
+}
+
+static raidz_map_t *
+vdev_raidz_map_alloc_expanded(abd_t *abd, uint64_t size, uint64_t offset, boolean_t dofree,
+ uint64_t ashift, uint64_t physical_cols, uint64_t logical_cols,
+ uint64_t nparity)
+{
+ /* The starting RAIDZ (parent) vdev sector of the block. */
+ uint64_t b = offset >> ashift;
+ /* The zio's size in units of the vdev's minimum sector size. */
+ uint64_t s = size >> ashift;
+ uint64_t cur_col = b % physical_cols;
+ /* The starting byte offset on each child vdev. */
+ uint64_t child_offset = (b / physical_cols) << ashift;
+ uint64_t q, r, bc, devidx, asize, tot;
+
+ /*
+ * "Quotient": The number of data sectors for this stripe on all but
+ * the "big column" child vdevs that also contain "remainder" data.
+ * AKA "full rows"
+ */
+ q = s / (logical_cols - nparity);
+
+ /*
+ * "Remainder": The number of partial stripe data sectors in this I/O.
+ * This will add a sector to some, but not all, child vdevs.
+ */
+ r = s - q * (logical_cols - nparity);
+
+ /* The number of "big columns" - those which contain remainder data. */
+ bc = (r == 0 ? 0 : r + nparity);
+
+ /*
+ * The total number of data and parity sectors associated with
+ * this I/O.
+ */
+ tot = s + nparity * (q + (r == 0 ? 0 : 1));
+
+ /* How many rows contain data (not skip) */
+ uint64_t rows = howmany(tot, logical_cols);
+ int cols = MIN(tot, logical_cols);
+
+ raidz_map_t *rm =
+ kmem_zalloc(offsetof(raidz_map_t, rm_row[rows]),
+ KM_SLEEP);
+ rm->rm_nrows = rows;
+ rm->rm_nskip = roundup(tot, nparity + 1) - tot;
+ asize = 0;
- if (rm->rm_firstdatacol == 1 && (offset & (1ULL << 20))) {
- devidx = rm->rm_col[0].rc_devidx;
- o = rm->rm_col[0].rc_offset;
- rm->rm_col[0].rc_devidx = rm->rm_col[1].rc_devidx;
- rm->rm_col[0].rc_offset = rm->rm_col[1].rc_offset;
- rm->rm_col[1].rc_devidx = devidx;
- rm->rm_col[1].rc_offset = o;
+ zfs_dbgmsg("rm=%p s=%d q=%d r=%d bc=%d nrows=%d cols=%d",
+ rm, (int)s, (int)q, (int)r, (int)bc, (int)rows, (int)cols);
+
+ for (uint64_t row = 0; row < rows; row++) {
+ raidz_row_t *rr = kmem_alloc(offsetof(raidz_row_t,
+ rr_col[cols]), KM_SLEEP);
+ rm->rm_row[row] = rr;
+
+ /*
+ * We set cols to the entire width of the block, even
+ * if this row is shorter. This is needed because parity
+ * generation (for Q and R) needs to know the entire width,
+ * because it treats the short row as though it was
+ * full-width (and the "phantom" sectors were zero-filled).
+ *
+ * Another approach to this would be to set cols shorter
+ * (to just the number of columns that we might do i/o to)
+ * and have another mechanism to tell the parity generation
+ * about the "entire width". Reconstruction (at least
+ * vdev_raidz_reconstruct_general()) would also need to
+ * know about the "entire width".
+ */
+ rr->rr_cols = cols;
+ rr->rr_missingdata = 0;
+ rr->rr_missingparity = 0;
+ rr->rr_firstdatacol = nparity;
+ rr->rr_abd_copy = NULL;
+
+ for (int c = 0; c < rr->rr_cols; c++, cur_col++) {
+ if (cur_col >= physical_cols) {
+ cur_col -= physical_cols;
+ child_offset += 1ULL << ashift;
+ }
+ rr->rr_col[c].rc_devidx = cur_col;
+ rr->rr_col[c].rc_offset = child_offset;
+ rr->rr_col[c].rc_gdata = NULL;
+ rr->rr_col[c].rc_orig_data = NULL;
+ rr->rr_col[c].rc_error = 0;
+ rr->rr_col[c].rc_tried = 0;
+ rr->rr_col[c].rc_skipped = 0;
+ rr->rr_col[c].rc_abd = NULL;
+ rr->rr_col[c].rc_need_orig_restore = B_FALSE;
+
+ uint64_t dc = c - rr->rr_firstdatacol;
+ if (c < rr->rr_firstdatacol) {
+ rr->rr_col[c].rc_size = 1ULL << ashift;
+ if (!dofree) {
+ rr->rr_col[c].rc_abd =
+ abd_alloc_linear(rr->rr_col[c].rc_size,
+ B_TRUE);
+ }
+ } else if (row == rows - 1 && bc != 0 && c >= bc) {
+ /*
+ * Past the end, this for parity generation.
+				 * Past the end, this is for parity generation.
+ rr->rr_col[c].rc_size = 0;
+ rr->rr_col[c].rc_abd = NULL;
+ } else {
+ /* XXX ASCII art diagram here */
+ /* "data column" (col excluding parity) */
+ uint64_t off;
+
+ if (c < bc || r == 0) {
+ off = dc * rows + row;
+ } else {
+ off = r * rows +
+ (dc - r) * (rows - 1) + row;
+ }
+ zfs_dbgmsg("rm=%p row=%d c=%d dc=%d off=%u devidx=%u",
+ rm, (int)row, (int)c, (int)dc, (int)off, (int)cur_col);
+ rr->rr_col[c].rc_size = 1ULL << ashift;
+ if (!dofree) {
+ rr->rr_col[c].rc_abd =
+ abd_get_offset(abd, off << ashift);
+ }
+ }
+
+ asize += rr->rr_col[c].rc_size;
+ }
+
+ /*
+ * If all data stored spans all columns, there's a danger that parity
+ * will always be on the same device and, since parity isn't read
+ * during normal operation, that that device's I/O bandwidth won't be
+ * used effectively. We therefore switch the parity every 1MB.
+ *
+ * ... at least that was, ostensibly, the theory. As a practical
+ * matter unless we juggle the parity between all devices evenly, we
+ * won't see any benefit. Further, occasional writes that aren't a
+ * multiple of the LCM of the number of children and the minimum
+ * stripe width are sufficient to avoid pessimal behavior.
+ * Unfortunately, this decision created an implicit on-disk format
+ * requirement that we need to support for all eternity, but only
+ * for single-parity RAID-Z.
+ *
+ * If we intend to skip a sector in the zeroth column for padding
+ * we must make sure to note this swap. We will never intend to
+ * skip the first column since at least one data and one parity
+ * column must appear in each row.
+ */
+ if (rr->rr_firstdatacol == 1 && rr->rr_cols > 1 &&
+ (offset & (1ULL << 20))) {
+ ASSERT(rr->rr_cols >= 2);
+ ASSERT(rr->rr_col[0].rc_size == rr->rr_col[1].rc_size);
+ devidx = rr->rr_col[0].rc_devidx;
+ uint64_t o = rr->rr_col[0].rc_offset;
+ rr->rr_col[0].rc_devidx = rr->rr_col[1].rc_devidx;
+ rr->rr_col[0].rc_offset = rr->rr_col[1].rc_offset;
+ rr->rr_col[1].rc_devidx = devidx;
+ rr->rr_col[1].rc_offset = o;
+ }
- if (rm->rm_skipstart == 0)
- rm->rm_skipstart = 1;
}
+ ASSERT3U(asize, ==, tot << ashift);
return (rm);
}
@@ -676,55 +867,48 @@
}
static void
-vdev_raidz_generate_parity_p(raidz_map_t *rm)
+vdev_raidz_generate_parity_p(raidz_row_t *rr)
{
- uint64_t *p;
- int c;
- abd_t *src;
+ uint64_t *p = abd_to_buf(rr->rr_col[VDEV_RAIDZ_P].rc_abd);
- for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) {
- src = rm->rm_col[c].rc_abd;
- p = abd_to_buf(rm->rm_col[VDEV_RAIDZ_P].rc_abd);
+ for (int c = rr->rr_firstdatacol; c < rr->rr_cols; c++) {
+ abd_t *src = rr->rr_col[c].rc_abd;
- if (c == rm->rm_firstdatacol) {
- abd_copy_to_buf(p, src, rm->rm_col[c].rc_size);
+ if (c == rr->rr_firstdatacol) {
+ abd_copy_to_buf(p, src, rr->rr_col[c].rc_size);
} else {
struct pqr_struct pqr = { p, NULL, NULL };
- (void) abd_iterate_func(src, 0, rm->rm_col[c].rc_size,
+ (void) abd_iterate_func(src, 0, rr->rr_col[c].rc_size,
vdev_raidz_p_func, &pqr);
}
}
}
static void
-vdev_raidz_generate_parity_pq(raidz_map_t *rm)
+vdev_raidz_generate_parity_pq(raidz_row_t *rr)
{
- uint64_t *p, *q, pcnt, ccnt, mask, i;
- int c;
- abd_t *src;
+ uint64_t *p = abd_to_buf(rr->rr_col[VDEV_RAIDZ_P].rc_abd);
+ uint64_t *q = abd_to_buf(rr->rr_col[VDEV_RAIDZ_Q].rc_abd);
+ uint64_t pcnt = rr->rr_col[VDEV_RAIDZ_P].rc_size / sizeof (p[0]);
+ ASSERT(rr->rr_col[VDEV_RAIDZ_P].rc_size ==
+ rr->rr_col[VDEV_RAIDZ_Q].rc_size);
- pcnt = rm->rm_col[VDEV_RAIDZ_P].rc_size / sizeof (p[0]);
- ASSERT(rm->rm_col[VDEV_RAIDZ_P].rc_size ==
- rm->rm_col[VDEV_RAIDZ_Q].rc_size);
+ for (int c = rr->rr_firstdatacol; c < rr->rr_cols; c++) {
+ abd_t *src = rr->rr_col[c].rc_abd;
- for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) {
- src = rm->rm_col[c].rc_abd;
- p = abd_to_buf(rm->rm_col[VDEV_RAIDZ_P].rc_abd);
- q = abd_to_buf(rm->rm_col[VDEV_RAIDZ_Q].rc_abd);
+ uint64_t ccnt = rr->rr_col[c].rc_size / sizeof (p[0]);
- ccnt = rm->rm_col[c].rc_size / sizeof (p[0]);
-
- if (c == rm->rm_firstdatacol) {
- abd_copy_to_buf(p, src, rm->rm_col[c].rc_size);
- (void) memcpy(q, p, rm->rm_col[c].rc_size);
+ if (c == rr->rr_firstdatacol) {
+ abd_copy_to_buf(p, src, rr->rr_col[c].rc_size);
+ (void) memcpy(q, p, rr->rr_col[c].rc_size);
} else {
struct pqr_struct pqr = { p, q, NULL };
- (void) abd_iterate_func(src, 0, rm->rm_col[c].rc_size,
+ (void) abd_iterate_func(src, 0, rr->rr_col[c].rc_size,
vdev_raidz_pq_func, &pqr);
}
- if (c == rm->rm_firstdatacol) {
- for (i = ccnt; i < pcnt; i++) {
+ if (c == rr->rr_firstdatacol) {
+ for (uint64_t i = ccnt; i < pcnt; i++) {
p[i] = 0;
q[i] = 0;
}
@@ -733,7 +917,8 @@
* Treat short columns as though they are full of 0s.
* Note that there's therefore nothing needed for P.
*/
- for (i = ccnt; i < pcnt; i++) {
+ uint64_t mask;
+ for (uint64_t i = ccnt; i < pcnt; i++) {
VDEV_RAIDZ_64MUL_2(q[i], mask);
}
}
@@ -741,38 +926,35 @@
}
static void
-vdev_raidz_generate_parity_pqr(raidz_map_t *rm)
+vdev_raidz_generate_parity_pqr(raidz_row_t *rr)
{
- uint64_t *p, *q, *r, pcnt, ccnt, mask, i;
- int c;
- abd_t *src;
-
- pcnt = rm->rm_col[VDEV_RAIDZ_P].rc_size / sizeof (p[0]);
- ASSERT(rm->rm_col[VDEV_RAIDZ_P].rc_size ==
- rm->rm_col[VDEV_RAIDZ_Q].rc_size);
- ASSERT(rm->rm_col[VDEV_RAIDZ_P].rc_size ==
- rm->rm_col[VDEV_RAIDZ_R].rc_size);
-
- for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) {
- src = rm->rm_col[c].rc_abd;
- p = abd_to_buf(rm->rm_col[VDEV_RAIDZ_P].rc_abd);
- q = abd_to_buf(rm->rm_col[VDEV_RAIDZ_Q].rc_abd);
- r = abd_to_buf(rm->rm_col[VDEV_RAIDZ_R].rc_abd);
-
- ccnt = rm->rm_col[c].rc_size / sizeof (p[0]);
-
- if (c == rm->rm_firstdatacol) {
- abd_copy_to_buf(p, src, rm->rm_col[c].rc_size);
- (void) memcpy(q, p, rm->rm_col[c].rc_size);
- (void) memcpy(r, p, rm->rm_col[c].rc_size);
+ uint64_t *p = abd_to_buf(rr->rr_col[VDEV_RAIDZ_P].rc_abd);
+ uint64_t *q = abd_to_buf(rr->rr_col[VDEV_RAIDZ_Q].rc_abd);
+ uint64_t *r = abd_to_buf(rr->rr_col[VDEV_RAIDZ_R].rc_abd);
+ uint64_t pcnt = rr->rr_col[VDEV_RAIDZ_P].rc_size / sizeof (p[0]);
+ ASSERT(rr->rr_col[VDEV_RAIDZ_P].rc_size ==
+ rr->rr_col[VDEV_RAIDZ_Q].rc_size);
+ ASSERT(rr->rr_col[VDEV_RAIDZ_P].rc_size ==
+ rr->rr_col[VDEV_RAIDZ_R].rc_size);
+
+ for (int c = rr->rr_firstdatacol; c < rr->rr_cols; c++) {
+ abd_t *src = rr->rr_col[c].rc_abd;
+
+ uint64_t ccnt = rr->rr_col[c].rc_size / sizeof (p[0]);
+
+ if (c == rr->rr_firstdatacol) {
+ abd_copy_to_buf(p, src, rr->rr_col[c].rc_size);
+ (void) memcpy(q, p, rr->rr_col[c].rc_size);
+ (void) memcpy(r, p, rr->rr_col[c].rc_size);
} else {
struct pqr_struct pqr = { p, q, r };
- (void) abd_iterate_func(src, 0, rm->rm_col[c].rc_size,
+ (void) abd_iterate_func(src, 0, rr->rr_col[c].rc_size,
vdev_raidz_pqr_func, &pqr);
}
- if (c == rm->rm_firstdatacol) {
- for (i = ccnt; i < pcnt; i++) {
+ if (c == rr->rr_firstdatacol) {
+ for (uint64_t i = ccnt; i < pcnt; i++) {
+ /* XXX does this really happen? firstdatacol should be the same size as the parity cols */
p[i] = 0;
q[i] = 0;
r[i] = 0;
@@ -782,7 +964,8 @@
* Treat short columns as though they are full of 0s.
* Note that there's therefore nothing needed for P.
*/
- for (i = ccnt; i < pcnt; i++) {
+ uint64_t mask;
+ for (uint64_t i = ccnt; i < pcnt; i++) {
VDEV_RAIDZ_64MUL_2(q[i], mask);
VDEV_RAIDZ_64MUL_4(r[i], mask);
}
@@ -795,17 +978,27 @@
* parity columns available.
*/
static void
-vdev_raidz_generate_parity(raidz_map_t *rm)
+vdev_raidz_generate_parity(raidz_row_t *rr)
{
- switch (rm->rm_firstdatacol) {
+ if (rr->rr_cols == 0) {
+ /*
+ * We are handling this block one row at a time (because
+ * this block has a different logical vs physical width,
+ * due to RAIDZ expansion), and this is a pad-only row,
+ * which has no parity.
+ */
+ return;
+ }
+
+ switch (rr->rr_firstdatacol) {
case 1:
- vdev_raidz_generate_parity_p(rm);
+ vdev_raidz_generate_parity_p(rr);
break;
case 2:
- vdev_raidz_generate_parity_pq(rm);
+ vdev_raidz_generate_parity_pq(rr);
break;
case 3:
- vdev_raidz_generate_parity_pqr(rm);
+ vdev_raidz_generate_parity_pqr(rr);
break;
default:
cmn_err(CE_PANIC, "invalid RAID-Z configuration");
@@ -929,30 +1122,31 @@
}
static int
-vdev_raidz_reconstruct_p(raidz_map_t *rm, int *tgts, int ntgts)
+vdev_raidz_reconstruct_p(raidz_row_t *rr, int *tgts, int ntgts)
{
int x = tgts[0];
- int c;
abd_t *dst, *src;
- ASSERT(ntgts == 1);
- ASSERT(x >= rm->rm_firstdatacol);
- ASSERT(x < rm->rm_cols);
+ zfs_dbgmsg("reconstruct_p(rm=%p x=%u)",
+ rr, x);
+
+ ASSERT3U(ntgts, ==, 1);
+ ASSERT3U(x, >=, rr->rr_firstdatacol);
+ ASSERT3U(x, <, rr->rr_cols);
- ASSERT(rm->rm_col[x].rc_size <= rm->rm_col[VDEV_RAIDZ_P].rc_size);
- ASSERT(rm->rm_col[x].rc_size > 0);
+ ASSERT3U(rr->rr_col[x].rc_size, <=, rr->rr_col[VDEV_RAIDZ_P].rc_size);
- src = rm->rm_col[VDEV_RAIDZ_P].rc_abd;
- dst = rm->rm_col[x].rc_abd;
+ src = rr->rr_col[VDEV_RAIDZ_P].rc_abd;
+ dst = rr->rr_col[x].rc_abd;
- abd_copy(dst, src, rm->rm_col[x].rc_size);
+ abd_copy(dst, src, rr->rr_col[x].rc_size);
- for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) {
- uint64_t size = MIN(rm->rm_col[x].rc_size,
- rm->rm_col[c].rc_size);
+ for (int c = rr->rr_firstdatacol; c < rr->rr_cols; c++) {
+ uint64_t size = MIN(rr->rr_col[x].rc_size,
+ rr->rr_col[c].rc_size);
- src = rm->rm_col[c].rc_abd;
- dst = rm->rm_col[x].rc_abd;
+ src = rr->rr_col[c].rc_abd;
+ dst = rr->rr_col[x].rc_abd; /* XXX not needed, done above */
if (c == x)
continue;
@@ -965,51 +1159,54 @@
}
static int
-vdev_raidz_reconstruct_q(raidz_map_t *rm, int *tgts, int ntgts)
+vdev_raidz_reconstruct_q(raidz_row_t *rr, int *tgts, int ntgts)
{
int x = tgts[0];
int c, exp;
abd_t *dst, *src;
+ zfs_dbgmsg("reconstruct_q(rm=%p x=%u)",
+ rr, x);
+
ASSERT(ntgts == 1);
- ASSERT(rm->rm_col[x].rc_size <= rm->rm_col[VDEV_RAIDZ_Q].rc_size);
+ ASSERT(rr->rr_col[x].rc_size <= rr->rr_col[VDEV_RAIDZ_Q].rc_size);
- for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) {
- uint64_t size = (c == x) ? 0 : MIN(rm->rm_col[x].rc_size,
- rm->rm_col[c].rc_size);
+ for (c = rr->rr_firstdatacol; c < rr->rr_cols; c++) {
+ uint64_t size = (c == x) ? 0 : MIN(rr->rr_col[x].rc_size,
+ rr->rr_col[c].rc_size);
- src = rm->rm_col[c].rc_abd;
- dst = rm->rm_col[x].rc_abd;
+ src = rr->rr_col[c].rc_abd;
+ dst = rr->rr_col[x].rc_abd;
- if (c == rm->rm_firstdatacol) {
+ if (c == rr->rr_firstdatacol) {
abd_copy(dst, src, size);
- if (rm->rm_col[x].rc_size > size)
+ if (rr->rr_col[x].rc_size > size)
abd_zero_off(dst, size,
- rm->rm_col[x].rc_size - size);
+ rr->rr_col[x].rc_size - size);
} else {
- ASSERT3U(size, <=, rm->rm_col[x].rc_size);
+ ASSERT3U(size, <=, rr->rr_col[x].rc_size);
(void) abd_iterate_func2(dst, src, 0, 0, size,
vdev_raidz_reconst_q_pre_func, NULL);
(void) abd_iterate_func(dst,
- size, rm->rm_col[x].rc_size - size,
+ size, rr->rr_col[x].rc_size - size,
vdev_raidz_reconst_q_pre_tail_func, NULL);
}
}
- src = rm->rm_col[VDEV_RAIDZ_Q].rc_abd;
- dst = rm->rm_col[x].rc_abd;
- exp = 255 - (rm->rm_cols - 1 - x);
+ src = rr->rr_col[VDEV_RAIDZ_Q].rc_abd;
+ dst = rr->rr_col[x].rc_abd;
+ exp = 255 - (rr->rr_cols - 1 - x);
struct reconst_q_struct rq = { abd_to_buf(src), exp };
- (void) abd_iterate_func(dst, 0, rm->rm_col[x].rc_size,
+ (void) abd_iterate_func(dst, 0, rr->rr_col[x].rc_size,
vdev_raidz_reconst_q_post_func, &rq);
return (1 << VDEV_RAIDZ_Q);
}
static int
-vdev_raidz_reconstruct_pq(raidz_map_t *rm, int *tgts, int ntgts)
+vdev_raidz_reconstruct_pq(raidz_row_t *rr, int *tgts, int ntgts)
{
uint8_t *p, *q, *pxy, *qxy, tmp, a, b, aexp, bexp;
abd_t *pdata, *qdata;
@@ -1018,12 +1215,15 @@
int y = tgts[1];
abd_t *xd, *yd;
+ zfs_dbgmsg("reconstruct_pq(rm=%p x=%u y=%u)",
+ rr, x, y);
+
ASSERT(ntgts == 2);
ASSERT(x < y);
- ASSERT(x >= rm->rm_firstdatacol);
- ASSERT(y < rm->rm_cols);
+ ASSERT(x >= rr->rr_firstdatacol);
+ ASSERT(y < rr->rr_cols);
- ASSERT(rm->rm_col[x].rc_size >= rm->rm_col[y].rc_size);
+ ASSERT(rr->rr_col[x].rc_size >= rr->rr_col[y].rc_size);
/*
* Move the parity data aside -- we're going to compute parity as
@@ -1032,29 +1232,29 @@
* parity so we make those columns appear to be full of zeros by
* setting their lengths to zero.
*/
- pdata = rm->rm_col[VDEV_RAIDZ_P].rc_abd;
- qdata = rm->rm_col[VDEV_RAIDZ_Q].rc_abd;
- xsize = rm->rm_col[x].rc_size;
- ysize = rm->rm_col[y].rc_size;
+ pdata = rr->rr_col[VDEV_RAIDZ_P].rc_abd;
+ qdata = rr->rr_col[VDEV_RAIDZ_Q].rc_abd;
+ xsize = rr->rr_col[x].rc_size;
+ ysize = rr->rr_col[y].rc_size;
- rm->rm_col[VDEV_RAIDZ_P].rc_abd =
- abd_alloc_linear(rm->rm_col[VDEV_RAIDZ_P].rc_size, B_TRUE);
- rm->rm_col[VDEV_RAIDZ_Q].rc_abd =
- abd_alloc_linear(rm->rm_col[VDEV_RAIDZ_Q].rc_size, B_TRUE);
- rm->rm_col[x].rc_size = 0;
- rm->rm_col[y].rc_size = 0;
+ rr->rr_col[VDEV_RAIDZ_P].rc_abd =
+ abd_alloc_linear(rr->rr_col[VDEV_RAIDZ_P].rc_size, B_TRUE);
+ rr->rr_col[VDEV_RAIDZ_Q].rc_abd =
+ abd_alloc_linear(rr->rr_col[VDEV_RAIDZ_Q].rc_size, B_TRUE);
+ rr->rr_col[x].rc_size = 0;
+ rr->rr_col[y].rc_size = 0;
- vdev_raidz_generate_parity_pq(rm);
+ vdev_raidz_generate_parity_pq(rr);
- rm->rm_col[x].rc_size = xsize;
- rm->rm_col[y].rc_size = ysize;
+ rr->rr_col[x].rc_size = xsize;
+ rr->rr_col[y].rc_size = ysize;
p = abd_to_buf(pdata);
q = abd_to_buf(qdata);
- pxy = abd_to_buf(rm->rm_col[VDEV_RAIDZ_P].rc_abd);
- qxy = abd_to_buf(rm->rm_col[VDEV_RAIDZ_Q].rc_abd);
- xd = rm->rm_col[x].rc_abd;
- yd = rm->rm_col[y].rc_abd;
+ pxy = abd_to_buf(rr->rr_col[VDEV_RAIDZ_P].rc_abd);
+ qxy = abd_to_buf(rr->rr_col[VDEV_RAIDZ_Q].rc_abd);
+ xd = rr->rr_col[x].rc_abd;
+ yd = rr->rr_col[y].rc_abd;
/*
* We now have:
@@ -1072,7 +1272,7 @@
*/
a = vdev_raidz_pow2[255 + x - y];
- b = vdev_raidz_pow2[255 - (rm->rm_cols - 1 - x)];
+ b = vdev_raidz_pow2[255 - (rr->rr_cols - 1 - x)];
tmp = 255 - vdev_raidz_log2[a ^ 1];
aexp = vdev_raidz_log2[vdev_raidz_exp2(a, tmp)];
@@ -1085,14 +1285,14 @@
(void) abd_iterate_func(xd, ysize, xsize - ysize,
vdev_raidz_reconst_pq_tail_func, &rpq);
- abd_free(rm->rm_col[VDEV_RAIDZ_P].rc_abd);
- abd_free(rm->rm_col[VDEV_RAIDZ_Q].rc_abd);
+ abd_free(rr->rr_col[VDEV_RAIDZ_P].rc_abd);
+ abd_free(rr->rr_col[VDEV_RAIDZ_Q].rc_abd);
/*
* Restore the saved parity data.
*/
- rm->rm_col[VDEV_RAIDZ_P].rc_abd = pdata;
- rm->rm_col[VDEV_RAIDZ_Q].rc_abd = qdata;
+ rr->rr_col[VDEV_RAIDZ_P].rc_abd = pdata;
+ rr->rr_col[VDEV_RAIDZ_Q].rc_abd = qdata;
return ((1 << VDEV_RAIDZ_P) | (1 << VDEV_RAIDZ_Q));
}
@@ -1249,13 +1449,13 @@
/* END CSTYLED */
static void
-vdev_raidz_matrix_init(raidz_map_t *rm, int n, int nmap, int *map,
+vdev_raidz_matrix_init(raidz_row_t *rr, int n, int nmap, int *map,
uint8_t **rows)
{
int i, j;
int pow;
- ASSERT(n == rm->rm_cols - rm->rm_firstdatacol);
+ ASSERT(n == rr->rr_cols - rr->rr_firstdatacol);
/*
* Fill in the missing rows of interest.
@@ -1279,7 +1479,7 @@
}
static void
-vdev_raidz_matrix_invert(raidz_map_t *rm, int n, int nmissing, int *missing,
+vdev_raidz_matrix_invert(raidz_row_t *rr, int n, int nmissing, int *missing,
uint8_t **rows, uint8_t **invrows, const uint8_t *used)
{
int i, j, ii, jj;
@@ -1291,10 +1491,10 @@
* correspond to data columns.
*/
for (i = 0; i < nmissing; i++) {
- ASSERT3S(used[i], <, rm->rm_firstdatacol);
+ ASSERT3S(used[i], <, rr->rr_firstdatacol);
}
for (; i < n; i++) {
- ASSERT3S(used[i], >=, rm->rm_firstdatacol);
+ ASSERT3S(used[i], >=, rr->rr_firstdatacol);
}
/*
@@ -1311,8 +1511,8 @@
*/
for (i = 0; i < nmissing; i++) {
for (j = nmissing; j < n; j++) {
- ASSERT3U(used[j], >=, rm->rm_firstdatacol);
- jj = used[j] - rm->rm_firstdatacol;
+ ASSERT3U(used[j], >=, rr->rr_firstdatacol);
+ jj = used[j] - rr->rr_firstdatacol;
ASSERT3S(jj, <, n);
invrows[i][j] = rows[i][jj];
rows[i][jj] = 0;
@@ -1373,7 +1573,7 @@
}
static void
-vdev_raidz_matrix_reconstruct(raidz_map_t *rm, int n, int nmissing,
+vdev_raidz_matrix_reconstruct(raidz_row_t *rr, int n, int nmissing,
int *missing, uint8_t **invrows, const uint8_t *used)
{
int i, j, x, cc, c;
@@ -1405,22 +1605,24 @@
for (i = 0; i < n; i++) {
c = used[i];
- ASSERT3U(c, <, rm->rm_cols);
+ ASSERT3U(c, <, rr->rr_cols);
- src = abd_to_buf(rm->rm_col[c].rc_abd);
- ccount = rm->rm_col[c].rc_size;
+ ccount = rr->rr_col[c].rc_size;
+ ASSERT(ccount >= rr->rr_col[missing[0]].rc_size || i > 0);
+ if (ccount == 0)
+ continue;
+ src = abd_to_buf(rr->rr_col[c].rc_abd);
for (j = 0; j < nmissing; j++) {
- cc = missing[j] + rm->rm_firstdatacol;
- ASSERT3U(cc, >=, rm->rm_firstdatacol);
- ASSERT3U(cc, <, rm->rm_cols);
+ cc = missing[j] + rr->rr_firstdatacol;
+ ASSERT3U(cc, >=, rr->rr_firstdatacol);
+ ASSERT3U(cc, <, rr->rr_cols);
ASSERT3U(cc, !=, c);
- dst[j] = abd_to_buf(rm->rm_col[cc].rc_abd);
- dcount[j] = rm->rm_col[cc].rc_size;
+ dcount[j] = rr->rr_col[cc].rc_size;
+ if (dcount[j] != 0)
+ dst[j] = abd_to_buf(rr->rr_col[cc].rc_abd);
}
- ASSERT(ccount >= rm->rm_col[missing[0]].rc_size || i > 0);
-
for (x = 0; x < ccount; x++, src++) {
if (*src != 0)
log = vdev_raidz_log2[*src];
@@ -1449,13 +1651,15 @@
}
static int
-vdev_raidz_reconstruct_general(raidz_map_t *rm, int *tgts, int ntgts)
+vdev_raidz_reconstruct_general(raidz_row_t *rr, int *tgts, int ntgts)
{
int n, i, c, t, tt;
int nmissing_rows;
int missing_rows[VDEV_RAIDZ_MAXPARITY];
int parity_map[VDEV_RAIDZ_MAXPARITY];
+ zfs_dbgmsg("reconstruct_general(rm=%p ntgts=%u)",
+ rr, ntgts);
uint8_t *p, *pp;
size_t psize;
@@ -1471,28 +1675,31 @@
* Matrix reconstruction can't use scatter ABDs yet, so we allocate
* temporary linear ABDs.
*/
- if (!abd_is_linear(rm->rm_col[rm->rm_firstdatacol].rc_abd)) {
- bufs = kmem_alloc(rm->rm_cols * sizeof (abd_t *), KM_PUSHPAGE);
+ if (!abd_is_linear(rr->rr_col[rr->rr_firstdatacol].rc_abd)) {
+ bufs = kmem_alloc(rr->rr_cols * sizeof (abd_t *), KM_PUSHPAGE);
- for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) {
- raidz_col_t *col = &rm->rm_col[c];
+ for (c = rr->rr_firstdatacol; c < rr->rr_cols; c++) {
+ raidz_col_t *col = &rr->rr_col[c];
bufs[c] = col->rc_abd;
- col->rc_abd = abd_alloc_linear(col->rc_size, B_TRUE);
- abd_copy(col->rc_abd, bufs[c], col->rc_size);
+ if (bufs[c] != NULL) {
+ col->rc_abd =
+ abd_alloc_linear(col->rc_size, B_TRUE);
+ abd_copy(col->rc_abd, bufs[c], col->rc_size);
+ }
}
}
- n = rm->rm_cols - rm->rm_firstdatacol;
+ n = rr->rr_cols - rr->rr_firstdatacol;
/*
* Figure out which data columns are missing.
*/
nmissing_rows = 0;
for (t = 0; t < ntgts; t++) {
- if (tgts[t] >= rm->rm_firstdatacol) {
+ if (tgts[t] >= rr->rr_firstdatacol) {
missing_rows[nmissing_rows++] =
- tgts[t] - rm->rm_firstdatacol;
+ tgts[t] - rr->rr_firstdatacol;
}
}
@@ -1502,7 +1709,7 @@
*/
for (tt = 0, c = 0, i = 0; i < nmissing_rows; c++) {
ASSERT(tt < ntgts);
- ASSERT(c < rm->rm_firstdatacol);
+ ASSERT(c < rr->rr_firstdatacol);
/*
* Skip any targeted parity columns.
@@ -1537,9 +1744,9 @@
used[i] = parity_map[i];
}
- for (tt = 0, c = rm->rm_firstdatacol; c < rm->rm_cols; c++) {
+ for (tt = 0, c = rr->rr_firstdatacol; c < rr->rr_cols; c++) {
if (tt < nmissing_rows &&
- c == missing_rows[tt] + rm->rm_firstdatacol) {
+ c == missing_rows[tt] + rr->rr_firstdatacol) {
tt++;
continue;
}
@@ -1552,18 +1759,18 @@
/*
* Initialize the interesting rows of the matrix.
*/
- vdev_raidz_matrix_init(rm, n, nmissing_rows, parity_map, rows);
+ vdev_raidz_matrix_init(rr, n, nmissing_rows, parity_map, rows);
/*
* Invert the matrix.
*/
- vdev_raidz_matrix_invert(rm, n, nmissing_rows, missing_rows, rows,
+ vdev_raidz_matrix_invert(rr, n, nmissing_rows, missing_rows, rows,
invrows, used);
/*
* Reconstruct the missing data using the generated matrix.
*/
- vdev_raidz_matrix_reconstruct(rm, n, nmissing_rows, missing_rows,
+ vdev_raidz_matrix_reconstruct(rr, n, nmissing_rows, missing_rows,
invrows, used);
kmem_free(p, psize);
@@ -1572,21 +1779,23 @@
* copy back from temporary linear abds and free them
*/
if (bufs) {
- for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) {
- raidz_col_t *col = &rm->rm_col[c];
+ for (c = rr->rr_firstdatacol; c < rr->rr_cols; c++) {
+ raidz_col_t *col = &rr->rr_col[c];
- abd_copy(bufs[c], col->rc_abd, col->rc_size);
- abd_free(col->rc_abd);
+ if (bufs[c] != NULL) {
+ abd_copy(bufs[c], col->rc_abd, col->rc_size);
+ abd_free(col->rc_abd);
+ }
col->rc_abd = bufs[c];
}
- kmem_free(bufs, rm->rm_cols * sizeof (abd_t *));
+ kmem_free(bufs, rr->rr_cols * sizeof (abd_t *));
}
return (code);
}
static int
-vdev_raidz_reconstruct(raidz_map_t *rm, int *t, int nt)
+vdev_raidz_reconstruct_row(raidz_row_t *rr, int *t, int nt)
{
int tgts[VDEV_RAIDZ_MAXPARITY], *dt;
int ntgts;
@@ -1595,26 +1804,37 @@
int nbadparity, nbaddata;
int parity_valid[VDEV_RAIDZ_MAXPARITY];
+ zfs_dbgmsg("reconstruct(rm=%p nt=%u cols=%u md=%u mp=%u)",
+ rr, nt, (int)rr->rr_cols, (int)rr->rr_missingdata, (int)rr->rr_missingparity);
+
/*
* The tgts list must already be sorted.
*/
+ zfs_dbgmsg("reconstruct(rm=%p t[%u]=%u)", rr, 0, t[0]);
for (i = 1; i < nt; i++) {
+ zfs_dbgmsg("reconstruct(rm=%p t[%u]=%u)",
+ rr, i, t[i]);
ASSERT(t[i] > t[i - 1]);
}
- nbadparity = rm->rm_firstdatacol;
- nbaddata = rm->rm_cols - nbadparity;
+ nbadparity = rr->rr_firstdatacol;
+ nbaddata = rr->rr_cols - nbadparity;
ntgts = 0;
- for (i = 0, c = 0; c < rm->rm_cols; c++) {
- if (c < rm->rm_firstdatacol)
+ for (i = 0, c = 0; c < rr->rr_cols; c++) {
+ zfs_dbgmsg("reconstruct(rm=%p col=%u devid=%u offset=%llx error=%u)",
+ rr, c,
+ (int)rr->rr_col[c].rc_devidx,
+ (long long)rr->rr_col[c].rc_offset,
+ (int)rr->rr_col[c].rc_error);
+ if (c < rr->rr_firstdatacol)
parity_valid[c] = B_FALSE;
if (i < nt && c == t[i]) {
tgts[ntgts++] = c;
i++;
- } else if (rm->rm_col[c].rc_error != 0) {
+ } else if (rr->rr_col[c].rc_error != 0) {
tgts[ntgts++] = c;
- } else if (c >= rm->rm_firstdatacol) {
+ } else if (c >= rr->rr_firstdatacol) {
nbaddata--;
} else {
parity_valid[c] = B_TRUE;
@@ -1635,30 +1855,30 @@
switch (nbaddata) {
case 1:
if (parity_valid[VDEV_RAIDZ_P])
- return (vdev_raidz_reconstruct_p(rm, dt, 1));
+ return (vdev_raidz_reconstruct_p(rr, dt, 1));
- ASSERT(rm->rm_firstdatacol > 1);
+ ASSERT(rr->rr_firstdatacol > 1);
if (parity_valid[VDEV_RAIDZ_Q])
- return (vdev_raidz_reconstruct_q(rm, dt, 1));
+ return (vdev_raidz_reconstruct_q(rr, dt, 1));
- ASSERT(rm->rm_firstdatacol > 2);
+ ASSERT(rr->rr_firstdatacol > 2);
break;
case 2:
- ASSERT(rm->rm_firstdatacol > 1);
+ ASSERT(rr->rr_firstdatacol > 1);
if (parity_valid[VDEV_RAIDZ_P] &&
parity_valid[VDEV_RAIDZ_Q])
- return (vdev_raidz_reconstruct_pq(rm, dt, 2));
+ return (vdev_raidz_reconstruct_pq(rr, dt, 2));
- ASSERT(rm->rm_firstdatacol > 2);
+ ASSERT(rr->rr_firstdatacol > 2);
break;
}
}
- code = vdev_raidz_reconstruct_general(rm, tgts, ntgts);
+ code = vdev_raidz_reconstruct_general(rr, tgts, ntgts);
ASSERT(code < (1 << VDEV_RAIDZ_MAXPARITY));
ASSERT(code > 0);
return (code);
@@ -1668,8 +1888,8 @@
vdev_raidz_open(vdev_t *vd, uint64_t *asize, uint64_t *max_asize,
uint64_t *logical_ashift, uint64_t *physical_ashift)
{
- vdev_t *cvd;
- uint64_t nparity = vd->vdev_nparity;
+ vdev_raidz_t *vdrz = vd->vdev_tsd;
+ uint64_t nparity = vdrz->vd_nparity;
int c;
int lasterror = 0;
int numerrors = 0;
@@ -1685,7 +1905,7 @@
vdev_open_children(vd);
for (c = 0; c < vd->vdev_children; c++) {
- cvd = vd->vdev_child[c];
+ vdev_t *cvd = vd->vdev_child[c];
if (cvd->vdev_open_error != 0) {
lasterror = cvd->vdev_open_error;
@@ -1786,9 +2006,10 @@
vdev_raidz_physio(vdev_t *vd, caddr_t data, size_t size,
uint64_t offset, uint64_t origoffset, boolean_t doread, boolean_t isdump)
{
+ vdev_raidz_t *vdrz = vd->vdev_tsd;
vdev_t *tvd = vd->vdev_top;
vdev_t *cvd;
- raidz_map_t *rm;
+ raidz_row_t *rr;
raidz_col_t *rc;
int c, err = 0;
@@ -1818,15 +2039,19 @@
*/
abd_t *abd = abd_get_from_buf(data - (offset - origoffset),
SPA_OLD_MAXBLOCKSIZE);
- rm = vdev_raidz_map_alloc(abd,
+ /*
+ * XXX deal with dump to expanded raidz
+ */
+ raidz_map_t *rm = vdev_raidz_map_alloc(abd,
SPA_OLD_MAXBLOCKSIZE, origoffset, B_FALSE, tvd->vdev_ashift,
- vd->vdev_children, vd->vdev_nparity);
+ vd->vdev_children, vdrz->vd_nparity);
+ rr = rm->rm_row[0];
coloffset = origoffset;
- for (c = rm->rm_firstdatacol; c < rm->rm_cols;
+ for (c = rr->rr_firstdatacol; c < rr->rr_cols;
c++, coloffset += rc->rc_size) {
- rc = &rm->rm_col[c];
+ rc = &rr->rr_col[c];
cvd = vd->vdev_child[rc->rc_devidx];
/*
@@ -1863,7 +2088,7 @@
break;
}
- vdev_raidz_map_free(rm);
+ vdev_raidz_row_free(rr);
abd_put(abd);
#endif /* KERNEL */
@@ -1874,10 +2099,11 @@
static uint64_t
vdev_raidz_asize(vdev_t *vd, uint64_t psize)
{
+ vdev_raidz_t *vdrz = vd->vdev_tsd;
uint64_t asize;
uint64_t ashift = vd->vdev_top->vdev_ashift;
- uint64_t cols = vd->vdev_children;
- uint64_t nparity = vd->vdev_nparity;
+ uint64_t cols = vdrz->vd_logical_width;
+ uint64_t nparity = vdrz->vd_nparity;
asize = ((psize - 1) >> ashift) + 1;
asize += nparity * ((asize + cols - nparity - 1) / (cols - nparity));
@@ -1896,119 +2122,137 @@
rc->rc_skipped = 0;
}
-/*
- * Start an IO operation on a RAIDZ VDev
- *
- * Outline:
- * - For write operations:
- * 1. Generate the parity data
- * 2. Create child zio write operations to each column's vdev, for both
- * data and parity.
- * 3. If the column skips any sectors for padding, create optional dummy
- * write zio children for those areas to improve aggregation continuity.
- * - For read operations:
- * 1. Create child zio read operations to each data column's vdev to read
- * the range of data required for zio.
- * 2. If this is a scrub or resilver operation, or if any of the data
- * vdevs have had errors, then create zio read operations to the parity
- * columns' VDevs as well.
- */
static void
-vdev_raidz_io_start(zio_t *zio)
+vdev_raidz_io_verify(zio_t *zio, raidz_row_t *rr, int col)
{
+#if 0
+#ifdef ZFS_DEBUG
vdev_t *vd = zio->io_vd;
vdev_t *tvd = vd->vdev_top;
- vdev_t *cvd;
- raidz_map_t *rm;
- raidz_col_t *rc;
- int c, i;
- rm = vdev_raidz_map_alloc(zio->io_abd, zio->io_size, zio->io_offset,
- zio->io_type == ZIO_TYPE_FREE,
- tvd->vdev_ashift, vd->vdev_children,
- vd->vdev_nparity);
+ range_seg_t logical_rs, physical_rs;
+ logical_rs.rs_start = zio->io_offset;
+ logical_rs.rs_end = logical_rs.rs_start +
+ vdev_raidz_asize(zio->io_vd, zio->io_size);
- zio->io_vsd = rm;
- zio->io_vsd_ops = &vdev_raidz_vsd_ops;
+ raidz_col_t *rc = &rr->rr_col[col];
+ vdev_t *cvd = vd->vdev_child[rc->rc_devidx];
- ASSERT3U(rm->rm_asize, ==, vdev_psize_to_asize(vd, zio->io_size));
+ vdev_xlate(cvd, &logical_rs, &physical_rs);
+ ASSERT3U(rc->rc_offset, ==, physical_rs.rs_start);
+ ASSERT3U(rc->rc_offset, <, physical_rs.rs_end);
+ /*
+ * It would be nice to assert that rs_end is equal
+ * to rc_offset + rc_size but there might be an
+ * optional I/O at the end that is not accounted in
+ * rc_size.
+ */
+ if (physical_rs.rs_end > rc->rc_offset + rc->rc_size) {
+ ASSERT3U(physical_rs.rs_end, ==, rc->rc_offset +
+ rc->rc_size + (1 << tvd->vdev_ashift));
+ } else {
+ ASSERT3U(physical_rs.rs_end, ==, rc->rc_offset + rc->rc_size);
+ }
+#endif
+#endif
+}
- if (zio->io_type == ZIO_TYPE_FREE) {
- for (c = 0; c < rm->rm_cols; c++) {
- rc = &rm->rm_col[c];
- cvd = vd->vdev_child[rc->rc_devidx];
- zio_nowait(zio_vdev_child_io(zio, NULL, cvd,
- rc->rc_offset, rc->rc_abd, rc->rc_size,
- zio->io_type, zio->io_priority, 0,
- vdev_raidz_child_done, rc));
- }
+static void
+vdev_raidz_io_start_free(zio_t *zio, raidz_row_t *rr)
+{
+ vdev_t *vd = zio->io_vd;
+ vdev_t *tvd = vd->vdev_top;
- zio_execute(zio);
- return;
+ for (int c = 0; c < rr->rr_cols; c++) {
+ raidz_col_t *rc = &rr->rr_col[c];
+ if (rc->rc_size == 0)
+ continue;
+ vdev_t *cvd = vd->vdev_child[rc->rc_devidx];
+
+ zio_nowait(zio_vdev_child_io(zio, NULL, cvd,
+ rc->rc_offset, rc->rc_abd, rc->rc_size,
+ zio->io_type, zio->io_priority, 0,
+ vdev_raidz_child_done, rc));
}
+}
- if (zio->io_type == ZIO_TYPE_WRITE) {
- vdev_raidz_generate_parity(rm);
+static void
+vdev_raidz_io_start_write(zio_t *zio, raidz_row_t *rr)
+{
+ vdev_t *vd = zio->io_vd;
+ vdev_t *tvd = vd->vdev_top;
- for (c = 0; c < rm->rm_cols; c++) {
- rc = &rm->rm_col[c];
- cvd = vd->vdev_child[rc->rc_devidx];
- zio_nowait(zio_vdev_child_io(zio, NULL, cvd,
- rc->rc_offset, rc->rc_abd, rc->rc_size,
- zio->io_type, zio->io_priority, 0,
- vdev_raidz_child_done, rc));
- }
+ vdev_raidz_generate_parity(rr);
- /*
- * Generate optional I/Os for any skipped sectors to improve
- * aggregation contiguity.
- */
- for (c = rm->rm_skipstart, i = 0; i < rm->rm_nskip; c++, i++) {
- ASSERT(c <= rm->rm_scols);
- if (c == rm->rm_scols)
- c = 0;
- rc = &rm->rm_col[c];
- cvd = vd->vdev_child[rc->rc_devidx];
- zio_nowait(zio_vdev_child_io(zio, NULL, cvd,
- rc->rc_offset + rc->rc_size, NULL,
- 1 << tvd->vdev_ashift,
- zio->io_type, zio->io_priority,
- ZIO_FLAG_NODATA | ZIO_FLAG_OPTIONAL, NULL, NULL));
- }
+ for (int c = 0; c < rr->rr_cols; c++) {
+ raidz_col_t *rc = &rr->rr_col[c];
+ if (rc->rc_size == 0)
+ continue;
+ vdev_t *cvd = vd->vdev_child[rc->rc_devidx];
- zio_execute(zio);
- return;
+ zio_nowait(zio_vdev_child_io(zio, NULL, cvd,
+ rc->rc_offset, rc->rc_abd, rc->rc_size,
+ zio->io_type, zio->io_priority, 0,
+ vdev_raidz_child_done, rc));
+ }
+
+ /* XXX do this in vdev_raidz_io_start, based on nskip stored in rm
+ */
+#if 0
+ /*
+ * Generate optional I/Os for any skipped sectors to improve
+ * aggregation contiguity.
+ */
+ for (int c = rr->rm_skipstart, i = 0; i < rr->rm_nskip; c++, i++) {
+ ASSERT(c <= rr->rm_scols);
+ if (c == rr->rm_scols)
+ c = 0;
+ raidz_col_t *rc = &rr->rr_col[c];
+ vdev_t *cvd = vd->vdev_child[rc->rc_devidx];
+ zio_nowait(zio_vdev_child_io(zio, NULL, cvd,
+ rc->rc_offset + rc->rc_size, NULL,
+ 1 << tvd->vdev_ashift,
+ zio->io_type, zio->io_priority,
+ ZIO_FLAG_NODATA | ZIO_FLAG_OPTIONAL, NULL, NULL));
}
+#endif
+}
- ASSERT(zio->io_type == ZIO_TYPE_READ);
+static void
+vdev_raidz_io_start_read(zio_t *zio, raidz_row_t *rr, boolean_t forceparity)
+{
+ vdev_t *vd = zio->io_vd;
/*
* Iterate over the columns in reverse order so that we hit the parity
* last -- any errors along the way will force us to read the parity.
*/
- for (c = rm->rm_cols - 1; c >= 0; c--) {
- rc = &rm->rm_col[c];
- cvd = vd->vdev_child[rc->rc_devidx];
+ for (int c = rr->rr_cols - 1; c >= 0; c--) {
+ raidz_col_t *rc = &rr->rr_col[c];
+ if (rc->rc_size == 0)
+ continue;
+ vdev_t *cvd = vd->vdev_child[rc->rc_devidx];
if (!vdev_readable(cvd)) {
- if (c >= rm->rm_firstdatacol)
- rm->rm_missingdata++;
+ if (c >= rr->rr_firstdatacol)
+ rr->rr_missingdata++;
else
- rm->rm_missingparity++;
+ rr->rr_missingparity++;
rc->rc_error = SET_ERROR(ENXIO);
rc->rc_tried = 1; /* don't even try */
rc->rc_skipped = 1;
continue;
}
if (vdev_dtl_contains(cvd, DTL_MISSING, zio->io_txg, 1)) {
- if (c >= rm->rm_firstdatacol)
- rm->rm_missingdata++;
+ if (c >= rr->rr_firstdatacol)
+ rr->rr_missingdata++;
else
- rm->rm_missingparity++;
+ rr->rr_missingparity++;
rc->rc_error = SET_ERROR(ESTALE);
rc->rc_skipped = 1;
continue;
}
- if (c >= rm->rm_firstdatacol || rm->rm_missingdata > 0 ||
+ if (forceparity ||
+ c >= rr->rr_firstdatacol || rr->rr_missingdata > 0 ||
(zio->io_flags & (ZIO_FLAG_SCRUB | ZIO_FLAG_RESILVER))) {
zio_nowait(zio_vdev_child_io(zio, NULL, cvd,
rc->rc_offset, rc->rc_abd, rc->rc_size,
@@ -2016,6 +2260,75 @@
vdev_raidz_child_done, rc));
}
}
+}
+
+/*
+ * Start an IO operation on a RAIDZ VDev
+ *
+ * Outline:
+ * - For write operations:
+ * 1. Generate the parity data
+ * 2. Create child zio write operations to each column's vdev, for both
+ * data and parity.
+ * 3. If the column skips any sectors for padding, create optional dummy
+ * write zio children for those areas to improve aggregation continuity.
+ * - For read operations:
+ * 1. Create child zio read operations to each data column's vdev to read
+ * the range of data required for zio.
+ * 2. If this is a scrub or resilver operation, or if any of the data
+ * vdevs have had errors, then create zio read operations to the parity
+ * columns' VDevs as well.
+ */
+static void
+vdev_raidz_io_start(zio_t *zio)
+{
+ vdev_t *vd = zio->io_vd;
+ vdev_t *tvd = vd->vdev_top;
+ vdev_raidz_t *vdrz = vd->vdev_tsd;
+ raidz_map_t *rm;
+
+ ASSERT(!vdrz->vn_expanding);
+
+ if (vdrz->vd_logical_width != vdrz->vd_physical_width) {
+ rm = vdev_raidz_map_alloc_expanded(zio->io_abd,
+ zio->io_size, zio->io_offset,
+ zio->io_type == ZIO_TYPE_FREE,
+ tvd->vdev_ashift, vdrz->vd_physical_width,
+ vdrz->vd_logical_width, vdrz->vd_nparity);
+ } else {
+ rm = vdev_raidz_map_alloc(zio->io_abd,
+ zio->io_size, zio->io_offset,
+ zio->io_type == ZIO_TYPE_FREE,
+ tvd->vdev_ashift, vdrz->vd_logical_width,
+ vdrz->vd_nparity);
+ }
+
+ zio->io_vsd = rm;
+ zio->io_vsd_ops = &vdev_raidz_vsd_ops;
+
+ if (zio->io_type == ZIO_TYPE_FREE) {
+ for (int i = 0; i < rm->rm_nrows; i++) {
+ vdev_raidz_io_start_free(zio, rm->rm_row[i]);
+ }
+ } else if (zio->io_type == ZIO_TYPE_WRITE) {
+ for (int i = 0; i < rm->rm_nrows; i++) {
+ vdev_raidz_io_start_write(zio,
+ rm->rm_row[i]);
+ }
+ } else {
+ ASSERT(zio->io_type == ZIO_TYPE_READ);
+ /*
+ * If there are multiple rows, we will be hitting
+ * all disks, so go ahead and read the parity so
+ * that we are reading in decent size chunks.
+ * XXX maybe doesn't really matter?
+ */
+ boolean_t forceparity = rm->rm_nrows > 1;
+ for (int i = 0; i < rm->rm_nrows; i++) {
+ vdev_raidz_io_start_read(zio,
+ rm->rm_row[i], forceparity);
+ }
+ }
zio_execute(zio);
}
@@ -2070,10 +2383,10 @@
* Generate the parity from the data columns. If we tried and were able to
* read the parity without error, verify that the generated parity matches the
* data we read. If it doesn't, we fire off a checksum error. Return the
- * number such failures.
+ * number of such failures.
*/
static int
-raidz_parity_verify(zio_t *zio, raidz_map_t *rm)
+raidz_parity_verify(zio_t *zio, raidz_row_t *rr)
{
void *orig[VDEV_RAIDZ_MAXPARITY];
int c, ret = 0;
@@ -2086,21 +2399,29 @@
if (checksum == ZIO_CHECKSUM_NOPARITY)
return (ret);
- for (c = 0; c < rm->rm_firstdatacol; c++) {
- rc = &rm->rm_col[c];
+ for (c = 0; c < rr->rr_firstdatacol; c++) {
+ rc = &rr->rr_col[c];
if (!rc->rc_tried || rc->rc_error != 0)
continue;
orig[c] = zio_buf_alloc(rc->rc_size);
abd_copy_to_buf(orig[c], rc->rc_abd, rc->rc_size);
}
- vdev_raidz_generate_parity(rm);
+ /* XXX regenerates parity even for !tried||rc_error!=0
+ * This could cause a side effect of fixing stuff we didn't realize
+ * was necessary (i.e. even if we return 0)
+ */
+ vdev_raidz_generate_parity(rr);
+
+ for (c = 0; c < rr->rr_firstdatacol; c++) {
+ rc = &rr->rr_col[c];
- for (c = 0; c < rm->rm_firstdatacol; c++) {
- rc = &rm->rm_col[c];
if (!rc->rc_tried || rc->rc_error != 0)
continue;
+
if (abd_cmp_buf(rc->rc_abd, orig[c], rc->rc_size) != 0) {
+ zfs_dbgmsg("raidz_parity_verify found error on col=%u devidx=%u",
+ c, (int)rc->rc_devidx);
raidz_checksum_error(zio, rc, orig[c]);
rc->rc_error = SET_ERROR(ECKSUM);
ret++;
@@ -2117,16 +2438,83 @@
static uint64_t raidz_corrected[1 << VDEV_RAIDZ_MAXPARITY];
static int
-vdev_raidz_worst_error(raidz_map_t *rm)
+vdev_raidz_worst_error(raidz_row_t *rr)
{
int error = 0;
- for (int c = 0; c < rm->rm_cols; c++)
- error = zio_worst_error(error, rm->rm_col[c].rc_error);
+ for (int c = 0; c < rr->rr_cols; c++)
+ error = zio_worst_error(error, rr->rr_col[c].rc_error);
return (error);
}
+static void
+vdev_raidz_io_done_verified(zio_t *zio, raidz_row_t *rr)
+{
+ int unexpected_errors = 0;
+ int parity_errors = 0;
+ int parity_untried = 0;
+ int data_errors = 0;
+
+ ASSERT3U(zio->io_type, ==, ZIO_TYPE_READ);
+
+ for (int c = 0; c < rr->rr_cols; c++) {
+ raidz_col_t *rc = &rr->rr_col[c];
+
+ if (rc->rc_error) {
+ if (c < rr->rr_firstdatacol)
+ parity_errors++;
+ else
+ data_errors++;
+
+ if (!rc->rc_skipped)
+ unexpected_errors++;
+ } else if (c < rr->rr_firstdatacol && !rc->rc_tried) {
+ parity_untried++;
+ }
+ }
+
+ /*
+ * If we read more parity disks than were used for
+ * reconstruction, confirm that the other parity disks produced
+ * correct data.
+ *
+ * Note that we also regenerate parity when resilvering so we
+ * can write it out to failed devices later.
+ */
+ zfs_dbgmsg("parity_errors=%u parity_untried=%u data_errors=%u verifying=%s",
+ parity_errors, parity_untried, data_errors,
+ (parity_errors + parity_untried < rr->rr_firstdatacol - data_errors) ? "yes" : "no");
+ if (parity_errors + parity_untried <
+ rr->rr_firstdatacol - data_errors ||
+ (zio->io_flags & ZIO_FLAG_RESILVER)) {
+ int n = raidz_parity_verify(zio, rr);
+ unexpected_errors += n;
+ ASSERT3U(parity_errors + n, <=, rr->rr_firstdatacol);
+ }
+
+ if (zio->io_error == 0 && spa_writeable(zio->io_spa) &&
+ (unexpected_errors > 0 || (zio->io_flags & ZIO_FLAG_RESILVER))) {
+ /*
+ * Use the good data we have in hand to repair damaged children.
+ */
+ for (int c = 0; c < rr->rr_cols; c++) {
+ raidz_col_t *rc = &rr->rr_col[c];
+ vdev_t *vd = zio->io_vd;
+ vdev_t *cvd = vd->vdev_child[rc->rc_devidx];
+
+ if (rc->rc_error == 0 || rc->rc_size == 0)
+ continue;
+
+ zio_nowait(zio_vdev_child_io(zio, NULL, cvd,
+ rc->rc_offset, rc->rc_abd, rc->rc_size,
+ ZIO_TYPE_WRITE, ZIO_PRIORITY_ASYNC_WRITE,
+ ZIO_FLAG_IO_REPAIR | (unexpected_errors ?
+ ZIO_FLAG_SELF_HEAL : 0), NULL, NULL));
+ }
+ }
+}
+
/*
* Iterate over all combinations of bad data and attempt a reconstruction.
* Note that the algorithm below is non-optimal because it doesn't take into
@@ -2134,454 +2522,771 @@
* triple-parity RAID-Z the reconstruction procedure is the same if column 4
* is targeted as invalid as if columns 1 and 4 are targeted since in both
* cases we'd only use parity information in column 0.
+ *
+ * The order that we find the various possible combinations of failed
+ * disks is dictated by these rules:
+ * - Examine each "slot" (the "i" in tgts[i])
+ * - Try to increment this slot (tgts[i] = tgts[i] + 1)
+ * - if we can't increment because it runs into the next slot,
+ * reset our slot to the minimum, and examine the next slot
+ * For example, with a 6-wide RAIDZ3, and no known errors (so we have to choose
+ * 3 columns to reconstruct), we will generate the following sequence:
+ *
+ * STATE ACTION
+ * 0 1 2 special case: skip since these are all parity
+ * 0 1 3 first slot: reset to 0; middle slot: increment to 2
+ * 0 2 3 first slot: increment to 1
+ * 1 2 3 first: reset to 0; middle: reset to 1; last: increment to 4
+ * 0 1 4 first: reset to 0; middle: increment to 2
+ * 0 2 4 first: increment to 1
+ * 1 2 4 first: reset to 0; middle: increment to 3
+ * 0 3 4 first: increment to 1
+ * 1 3 4 first: increment to 2
+ * 2 3 4 first: reset to 0; middle: reset to 1; last: increment to 5
+ * 0 1 5 first: reset to 0; middle: increment to 2
+ * 0 2 5 first: increment to 1
+ * 1 2 5 first: reset to 0; middle: increment to 3
+ * 0 3 5 first: increment to 1
+ * 1 3 5 first: increment to 2
+ * 2 3 5 first: reset to 0; middle: increment to 4
+ * 0 4 5 first: increment to 1
+ * 1 4 5 first: increment to 2
+ * 2 4 5 first: increment to 3
+ * 3 4 5 done
*/
-static int
-vdev_raidz_combrec(zio_t *zio, int total_errors, int data_errors)
-{
- raidz_map_t *rm = zio->io_vsd;
- raidz_col_t *rc;
- void *orig[VDEV_RAIDZ_MAXPARITY];
- int tstore[VDEV_RAIDZ_MAXPARITY + 2];
- int *tgts = &tstore[1];
- int current, next, i, c, n;
- int code, ret = 0;
- ASSERT(total_errors < rm->rm_firstdatacol);
+/*
+ * Should this sector be considered failed for logical child ID i?
+ * XXX comment explaining logical child IDs
+ */
+static boolean_t
+raidz_simulate_failure(vdev_raidz_t *vdrz, int ashift, int i, raidz_col_t *rc)
+{
+ uint64_t sector_id =
+ vdrz->vd_physical_width * (rc->rc_offset >> ashift) +
+ rc->rc_devidx;
+
+#if 0
+ zfs_dbgmsg("raidz_simulate_failure(pw=%u lw=%u ashift=%u i=%u rc_offset=%llx rc_devidx=%u sector_id=%u",
+ vdrz->vd_physical_width,
+ vdrz->vd_logical_width,
+ ashift,
+ i,
+ (long long)rc->rc_offset,
+ (int)rc->rc_devidx,
+ (long long)sector_id);
+#endif
- /*
- * This simplifies one edge condition.
- */
- tgts[-1] = -1;
+ for (int w = vdrz->vd_physical_width;
+ w >= vdrz->vd_logical_width; w--) {
+ if (i < w) {
+ return (sector_id % w == i);
+ } else {
+ i -= w;
+ }
+ }
+ ASSERT(!"invalid logical child id");
+ return (B_FALSE);
+}
- for (n = 1; n <= rm->rm_firstdatacol - total_errors; n++) {
- /*
- * Initialize the targets array by finding the first n columns
- * that contain no error.
- *
- * If there were no data errors, we need to ensure that we're
- * always explicitly attempting to reconstruct at least one
- * data column. To do this, we simply push the highest target
- * up into the data columns.
- */
- for (c = 0, i = 0; i < n; i++) {
- if (i == n - 1 && data_errors == 0 &&
- c < rm->rm_firstdatacol) {
- c = rm->rm_firstdatacol;
+static void
+raidz_restore_orig_data(raidz_map_t *rm)
+{
+ for (int i = 0; i < rm->rm_nrows; i++) {
+ raidz_row_t *rr = rm->rm_row[i];
+ for (int c = 0; c < rr->rr_cols; c++) {
+ raidz_col_t *rc = &rr->rr_col[c];
+ if (rc->rc_need_orig_restore) {
+ abd_copy_from_buf(rc->rc_abd,
+ rc->rc_orig_data, rc->rc_size);
+ rc->rc_need_orig_restore = B_FALSE;
}
+ }
+ }
+}
- while (rm->rm_col[c].rc_error != 0) {
- c++;
- ASSERT3S(c, <, rm->rm_cols);
+/*
+ * returns EINVAL if reconstruction of the block will not be possible
+ * returns ECKSUM if this specific reconstruction failed
+ * returns 0 on successful reconstruction
+ */
+static int
+raidz_reconstruct(zio_t *zio, int *ltgts, int ntgts)
+{
+ raidz_map_t *rm = zio->io_vsd;
+ vdev_raidz_t *vdrz = zio->io_vd->vdev_tsd;
+
+ zfs_dbgmsg("raidz_reconstruct_expanded(zio=%p ltgts=%u,%u,%u ntgts=%u",
+ zio, ltgts[0], ltgts[1], ltgts[2], ntgts);
+
+ /* Reconstruct each row */
+ for (int r = 0; r < rm->rm_nrows; r++) {
+ raidz_row_t *rr = rm->rm_row[r];
+ int my_tgts[VDEV_RAIDZ_MAXPARITY]; /* value is child id */
+ int t = 0;
+ int dead = 0;
+ int dead_data = 0;
+
+ zfs_dbgmsg("raidz_reconstruct_expanded(row=%u)",
+ r);
+
+ for (int c = 0; c < rr->rr_cols; c++) {
+ raidz_col_t *rc = &rr->rr_col[c];
+ ASSERT0(rc->rc_need_orig_restore);
+ if (rc->rc_error != 0) {
+ dead++;
+ if (c >= vdrz->vd_nparity)
+ dead_data++;
+ continue;
+ }
+ if (rc->rc_size == 0)
+ continue;
+ for (int lt = 0; lt < ntgts; lt++) {
+ if (raidz_simulate_failure(vdrz,
+ zio->io_vd->vdev_top->vdev_ashift,
+ ltgts[lt], rc)) {
+ if (rc->rc_orig_data == NULL) {
+ rc->rc_orig_data =
+ zio_buf_alloc(rc->rc_size);
+ abd_copy_to_buf(rc->rc_orig_data,
+ rc->rc_abd, rc->rc_size);
+ }
+ rc->rc_need_orig_restore = B_TRUE;
+
+ dead++;
+ if (c >= vdrz->vd_nparity)
+ dead_data++;
+ my_tgts[t++] = c;
+ zfs_dbgmsg("simulating failure of col %u devidx %u",
+ c, (int)rc->rc_devidx);
+ break;
+ }
}
-
- tgts[i] = c++;
}
-
- /*
- * Setting tgts[n] simplifies the other edge condition.
- */
- tgts[n] = rm->rm_cols;
-
- /*
- * These buffers were allocated in previous iterations.
- */
- for (i = 0; i < n - 1; i++) {
- ASSERT(orig[i] != NULL);
+ if (dead > vdrz->vd_nparity) {
+ /* reconstruction not possible */
+ zfs_dbgmsg("reconstruction not possible; too many failures");
+ raidz_restore_orig_data(rm);
+ return (EINVAL);
}
+ rr->rr_code = 0;
+ if (dead_data > 0)
+ rr->rr_code = vdev_raidz_reconstruct_row(rr, my_tgts, t);
+ }
- orig[n - 1] = zio_buf_alloc(rm->rm_col[0].rc_size);
+ /* Check for success */
+ if (raidz_checksum_verify(zio) == 0) {
+
+ /* Reconstruction succeeded - report errors */
+ for (int i = 0; i < rm->rm_nrows; i++) {
+ raidz_row_t *rr = rm->rm_row[i];
+ atomic_inc_64(&raidz_corrected[rr->rr_code]);
+
+ for (int c = 0; c < rr->rr_cols; c++) {
+ raidz_col_t *rc = &rr->rr_col[c];
+ if (rc->rc_need_orig_restore) {
+ /*
+ * Note: if this is a parity column,
+ * we don't really know if it's wrong.
+ * We need to let
+ * vdev_raidz_io_done_verified() check
+ * it, and if we set rc_error, it will
+ * think that it is a "known" error
+ * that doesn't need to be checked
+ * or corrected.
+ */
+ if (rc->rc_error == 0 && c >= rr->rr_firstdatacol) {
+ raidz_checksum_error(zio, rc, rc->rc_gdata);
+ rc->rc_error = SET_ERROR(ECKSUM);
+ }
+ rc->rc_need_orig_restore = B_FALSE;
+ }
+ }
- current = 0;
- next = tgts[current];
+ vdev_raidz_io_done_verified(zio, rr);
+ }
- while (current != n) {
- tgts[current] = next;
- current = 0;
+ zio_checksum_verified(zio);
- /*
- * Save off the original data that we're going to
- * attempt to reconstruct.
- */
- for (i = 0; i < n; i++) {
- ASSERT(orig[i] != NULL);
- c = tgts[i];
- ASSERT3S(c, >=, 0);
- ASSERT3S(c, <, rm->rm_cols);
- rc = &rm->rm_col[c];
- abd_copy_to_buf(orig[i], rc->rc_abd,
- rc->rc_size);
- }
+ zfs_dbgmsg("reconstruction successful (checksum verified)");
+ return (0);
+ }
- /*
- * Attempt a reconstruction and exit the outer loop on
- * success.
- */
- code = vdev_raidz_reconstruct(rm, tgts, n);
- if (raidz_checksum_verify(zio) == 0) {
- atomic_inc_64(&raidz_corrected[code]);
-
- for (i = 0; i < n; i++) {
- c = tgts[i];
- rc = &rm->rm_col[c];
- ASSERT(rc->rc_error == 0);
- if (rc->rc_tried)
- raidz_checksum_error(zio, rc,
- orig[i]);
- rc->rc_error = SET_ERROR(ECKSUM);
- }
+ /* Reconstruction failed - restore original data */
+ raidz_restore_orig_data(rm);
+ zfs_dbgmsg("raidz_reconstruct_expanded(zio=%p) checksum failed",
+ zio);
+ return (ECKSUM);
+}
- ret = code;
- goto done;
- }
+/*
+ * return 0 on success, ECKSUM on failure
+ */
+static int
+vdev_raidz_combrec(zio_t *zio)
+{
+ raidz_map_t *rm = zio->io_vsd;
+ vdev_raidz_t *vdrz = zio->io_vd->vdev_tsd;
+
+ for (int num_failures = 1; num_failures <= vdrz->vd_nparity;
+ num_failures++) {
+ int tstore[VDEV_RAIDZ_MAXPARITY + 2];
+ int *ltgts = &tstore[1]; /* value is logical child ID */
+
+ /* Determine number of logical children, n */
+ int n = 0;
+ for (int w = vdrz->vd_physical_width;
+ w >= vdrz->vd_logical_width; w--) {
+ n += w;
+ }
- /*
- * Restore the original data.
- */
- for (i = 0; i < n; i++) {
- c = tgts[i];
- rc = &rm->rm_col[c];
- abd_copy_from_buf(rc->rc_abd, orig[i],
- rc->rc_size);
- }
+ ASSERT3U(num_failures, <=, vdrz->vd_nparity);
+ ASSERT3U(num_failures, <=, VDEV_RAIDZ_MAXPARITY);
+ /* handle corner cases in combrec logic */
+ ltgts[-1] = -1;
+ for (int i = 0; i < num_failures; i++) {
+ ltgts[i] = i;
+ }
+ ltgts[num_failures] = n;
- do {
+ for (;;) {
+ int err = raidz_reconstruct(zio,
+ ltgts, num_failures);
+ if (err == EINVAL) {
/*
- * Find the next valid column after the current
- * position..
+ * Reconstruction not possible with this #
+ * failures; try more failures.
*/
- for (next = tgts[current] + 1;
- next < rm->rm_cols &&
- rm->rm_col[next].rc_error != 0; next++)
- continue;
+ break;
+ } else if (err == 0)
+ return (0);
+
+ /* Compute next targets to try */
+ for (int t = 0; ; t++) {
+ ASSERT3U(t, <, num_failures);
+ ltgts[t]++;
+ if (ltgts[t] == n) {
+ ASSERT3U(t, ==, num_failures - 1);
+ zfs_dbgmsg("reconstruction failed for num_failures=%u; tried all combinations",
+ num_failures);
+ break; // try more failures
+ }
- ASSERT(next <= tgts[current + 1]);
+ ASSERT3U(ltgts[t], <, n);
+ ASSERT3U(ltgts[t], <=, ltgts[t + 1]);
/*
* If that spot is available, we're done here.
*/
- if (next != tgts[current + 1])
- break;
+ if (ltgts[t] != ltgts[t + 1])
+ break; // found next combination
/*
- * Otherwise, find the next valid column after
- * the previous position.
+ * Otherwise, reset this tgt to the minimum,
+ * and move on to the next tgt.
*/
- for (c = tgts[current - 1] + 1;
- rm->rm_col[c].rc_error != 0; c++)
- continue;
-
- tgts[current] = c;
- current++;
-
- } while (current != n);
+ ltgts[t] = ltgts[t - 1] + 1;
+ ASSERT3U(ltgts[t], ==, t);
+ }
+ if (ltgts[num_failures - 1] == n)
+ break; // try more failures
}
}
- n--;
-done:
- for (i = 0; i < n; i++) {
- zio_buf_free(orig[i], rm->rm_col[0].rc_size);
- }
-
- return (ret);
+ zfs_dbgmsg("reconstruction failed for all num_failures");
+ return (ECKSUM);
}
/*
- * Complete an IO operation on a RAIDZ VDev
+ * Complete a write IO operation on a RAIDZ VDev
*
* Outline:
- * - For write operations:
* 1. Check for errors on the child IOs.
* 2. Return, setting an error code if too few child VDevs were written
* to reconstruct the data later. Note that partial writes are
* considered successful if they can be reconstructed at all.
- * - For read operations:
- * 1. Check for errors on the child IOs.
- * 2. If data errors occurred:
- * a. Try to reassemble the data from the parity available.
- * b. If we haven't yet read the parity drives, read them now.
- * c. If all parity drives have been read but the data still doesn't
- * reassemble with a correct checksum, then try combinatorial
- * reconstruction.
- * d. If that doesn't work, return an error.
- * 3. If there were unexpected errors or this is a resilver operation,
- * rewrite the vdevs that had errors.
*/
static void
-vdev_raidz_io_done(zio_t *zio)
+vdev_raidz_io_done_write_impl(zio_t *zio, raidz_row_t *rr)
+{
+ int total_errors = 0;
+
+ ASSERT3U(rr->rr_missingparity, <=, rr->rr_firstdatacol);
+ ASSERT3U(rr->rr_missingdata, <=, rr->rr_cols - rr->rr_firstdatacol);
+ ASSERT3U(zio->io_type, ==, ZIO_TYPE_WRITE);
+
+ for (int c = 0; c < rr->rr_cols; c++) {
+ raidz_col_t *rc = &rr->rr_col[c];
+
+ if (rc->rc_error) {
+ ASSERT(rc->rc_error != ECKSUM); /* child has no bp */
+
+ total_errors++;
+ }
+ }
+
+ /*
+ * XXX -- for now, treat partial writes as a success.
+ * (If we couldn't write enough columns to reconstruct
+ * the data, the I/O failed. Otherwise, good enough.)
+ *
+ * Now that we support write reallocation, it would be better
+ * to treat partial failure as real failure unless there are
+ * no non-degraded top-level vdevs left, and not update DTLs
+ * if we intend to reallocate.
+ */
+ /* XXPOLICY */
+ if (total_errors > rr->rr_firstdatacol) {
+ zio->io_error = zio_worst_error(zio->io_error,
+ vdev_raidz_worst_error(rr));
+ }
+}
+
+/*
+ * return 0 if no reconstruction occurred, otherwise the "code" from
+ * vdev_raidz_reconstruct().
+ */
+static int
+vdev_raidz_io_done_reconstruct_known_missing(zio_t *zio, raidz_row_t *rr)
{
- vdev_t *vd = zio->io_vd;
- vdev_t *cvd;
- raidz_map_t *rm = zio->io_vsd;
- raidz_col_t *rc;
- int unexpected_errors = 0;
int parity_errors = 0;
int parity_untried = 0;
int data_errors = 0;
int total_errors = 0;
- int n, c;
- int tgts[VDEV_RAIDZ_MAXPARITY];
- int code;
-
- ASSERT(zio->io_bp != NULL); /* XXX need to add code to enforce this */
+ int code = 0;
- ASSERT(rm->rm_missingparity <= rm->rm_firstdatacol);
- ASSERT(rm->rm_missingdata <= rm->rm_cols - rm->rm_firstdatacol);
+ ASSERT3U(rr->rr_missingparity, <=, rr->rr_firstdatacol);
+ ASSERT3U(rr->rr_missingdata, <=, rr->rr_cols - rr->rr_firstdatacol);
+ ASSERT3U(zio->io_type, ==, ZIO_TYPE_READ);
- for (c = 0; c < rm->rm_cols; c++) {
- rc = &rm->rm_col[c];
+ for (int c = 0; c < rr->rr_cols; c++) {
+ raidz_col_t *rc = &rr->rr_col[c];
if (rc->rc_error) {
ASSERT(rc->rc_error != ECKSUM); /* child has no bp */
- if (c < rm->rm_firstdatacol)
+ if (c < rr->rr_firstdatacol)
parity_errors++;
else
data_errors++;
- if (!rc->rc_skipped)
- unexpected_errors++;
-
total_errors++;
- } else if (c < rm->rm_firstdatacol && !rc->rc_tried) {
+ } else if (c < rr->rr_firstdatacol && !rc->rc_tried) {
parity_untried++;
}
}
- if (zio->io_type == ZIO_TYPE_WRITE) {
+ /*
+ * If there were data errors and the number of errors we saw was
+ * correctable -- less than or equal to the number of parity disks read
+ * -- reconstruct based on the missing data.
+ */
+ if (data_errors != 0 &&
+ total_errors <= rr->rr_firstdatacol - parity_untried) {
/*
- * XXX -- for now, treat partial writes as a success.
- * (If we couldn't write enough columns to reconstruct
- * the data, the I/O failed. Otherwise, good enough.)
- *
- * Now that we support write reallocation, it would be better
- * to treat partial failure as real failure unless there are
- * no non-degraded top-level vdevs left, and not update DTLs
- * if we intend to reallocate.
+ * We either attempt to read all the parity columns or
+ * none of them. If we didn't try to read parity, we
+ * wouldn't be here in the correctable case. There must
+ * also have been fewer parity errors than parity
+ * columns or, again, we wouldn't be in this code path.
*/
- /* XXPOLICY */
- if (total_errors > rm->rm_firstdatacol)
- zio->io_error = vdev_raidz_worst_error(rm);
+ ASSERT(parity_untried == 0);
+ ASSERT(parity_errors < rr->rr_firstdatacol);
- return;
- } else if (zio->io_type == ZIO_TYPE_FREE) {
- return;
+ /*
+ * Identify the data columns that reported an error.
+ */
+ int n = 0;
+ int tgts[VDEV_RAIDZ_MAXPARITY];
+ for (int c = rr->rr_firstdatacol; c < rr->rr_cols; c++) {
+ raidz_col_t *rc = &rr->rr_col[c];
+ if (rc->rc_error != 0) {
+ ASSERT(n < VDEV_RAIDZ_MAXPARITY);
+ tgts[n++] = c;
+ }
+ }
+
+ ASSERT(rr->rr_firstdatacol >= n);
+
+ code = vdev_raidz_reconstruct_row(rr, tgts, n);
}
- ASSERT(zio->io_type == ZIO_TYPE_READ);
- /*
- * There are three potential phases for a read:
- * 1. produce valid data from the columns read
- * 2. read all disks and try again
- * 3. perform combinatorial reconstruction
- *
- * Each phase is progressively both more expensive and less likely to
- * occur. If we encounter more errors than we can repair or all phases
- * fail, we have no choice but to return an error.
- */
+ return (code);
+}
- /*
- * If the number of errors we saw was correctable -- less than or equal
- * to the number of parity disks read -- attempt to produce data that
- * has a valid checksum. Naturally, this case applies in the absence of
- * any errors.
- */
- if (total_errors <= rm->rm_firstdatacol - parity_untried) {
- if (data_errors == 0) {
- if (raidz_checksum_verify(zio) == 0) {
- /*
- * If we read parity information (unnecessarily
- * as it happens since no reconstruction was
- * needed) regenerate and verify the parity.
- * We also regenerate parity when resilvering
- * so we can write it out to the failed device
- * later.
- */
- if (parity_errors + parity_untried <
- rm->rm_firstdatacol ||
- (zio->io_flags & ZIO_FLAG_RESILVER)) {
- n = raidz_parity_verify(zio, rm);
- unexpected_errors += n;
- ASSERT(parity_errors + n <=
- rm->rm_firstdatacol);
- }
- goto done;
+/*
+ * return the number of reads issued.
+ */
+static int
+vdev_raidz_read_all(zio_t *zio, raidz_row_t *rr)
+{
+ vdev_t *vd = zio->io_vd;
+ int nread = 0;
+
+ rr->rr_missingdata = 0;
+ rr->rr_missingparity = 0;
+
+ for (int c = 0; c < rr->rr_cols; c++) {
+ raidz_col_t *rc = &rr->rr_col[c];
+ if (rc->rc_tried || rc->rc_size == 0)
+ continue;
+
+ zio_nowait(zio_vdev_child_io(zio, NULL,
+ vd->vdev_child[rc->rc_devidx],
+ rc->rc_offset, rc->rc_abd, rc->rc_size,
+ zio->io_type, zio->io_priority, 0,
+ vdev_raidz_child_done, rc));
+ nread++;
+ }
+ return (nread);
+}
+
+static void
+vdev_raidz_io_done(zio_t *zio)
+{
+ raidz_map_t *rm = zio->io_vsd;
+ vdev_raidz_t *vdrz = zio->io_vd->vdev_tsd;
+
+ ASSERT(zio->io_bp != NULL); /* XXX need to add code to enforce this */
+ if (zio->io_type == ZIO_TYPE_WRITE) {
+ for (int i = 0; i < rm->rm_nrows; i++) {
+ vdev_raidz_io_done_write_impl(zio, rm->rm_row[i]);
+ }
+ } else if (zio->io_type == ZIO_TYPE_FREE) {
+ return;
+ } else {
+ for (int i = 0; i < rm->rm_nrows; i++) {
+ raidz_row_t *rr = rm->rm_row[i];
+ rr->rr_code =
+ vdev_raidz_io_done_reconstruct_known_missing(zio,
+ rr);
+ }
+
+ if (raidz_checksum_verify(zio) == 0) {
+ for (int i = 0; i < rm->rm_nrows; i++) {
+ raidz_row_t *rr = rm->rm_row[i];
+ atomic_inc_64(&raidz_corrected[rr->rr_code]);
+ vdev_raidz_io_done_verified(zio, rr);
}
+ zio_checksum_verified(zio);
} else {
/*
- * We either attempt to read all the parity columns or
- * none of them. If we didn't try to read parity, we
- * wouldn't be here in the correctable case. There must
- * also have been fewer parity errors than parity
- * columns or, again, we wouldn't be in this code path.
+ * This isn't a typical situation -- either we got a
+ * read error or a child silently returned bad data.
+ * Read every block so we can try again with as much
+ * data and parity as we can track down. If we've
+ * already been through once before, all children will
+ * be marked as tried so we'll proceed to combinatorial
+ * reconstruction.
*/
- ASSERT(parity_untried == 0);
- ASSERT(parity_errors < rm->rm_firstdatacol);
-
+ int nread = 0;
+ for (int i = 0; i < rm->rm_nrows; i++) {
+ nread += vdev_raidz_read_all(zio,
+ rm->rm_row[i]);
+ }
+ if (nread != 0) {
+ /*
+ * Normally our stage is VDEV_IO_DONE, but if
+ * we've already called redone(), it will have
+ * changed to VDEV_IO_START, in which case we
+ * don't want to call redone() again.
+ */
+ if (zio->io_stage != ZIO_STAGE_VDEV_IO_START)
+ zio_vdev_io_redone(zio);
+ return;
+ }
/*
- * Identify the data columns that reported an error.
+ * It would be too expensive to try every possible
+ * combination of failed sectors in every row, so
+ * instead we try every combination of failed current or
+ * past physical disk. This means that if the incorrect
+ * sectors were all on Nparity disks at any point in the
+ * past, we will find the correct data. I think that
+ * the only case where this is less durable than
+ * a non-expanded RAIDZ, is if we have a silent
+ * failure during expansion. In that case, one block
+ * could be partially in the old format and partially
+ * in the new format, so we'd lose some sectors
+ * from the old format and some from the new format.
+ *
+ * e.g. logical_width=4 physical_width=6
+ * the 15 (6+5+4) possible failed disks are:
+ * width=6 child=0
+ * width=6 child=1
+ * width=6 child=2
+ * width=6 child=3
+ * width=6 child=4
+ * width=6 child=5
+ * width=5 child=0
+ * width=5 child=1
+ * width=5 child=2
+ * width=5 child=3
+ * width=5 child=4
+ * width=4 child=0
+ * width=4 child=1
+ * width=4 child=2
+ * width=4 child=3
+ * And we will try every combination of Nparity of these
+ * failing.
+ *
+ * As a first pass, we can generate every combo,
+ * and try reconstructing, ignoring any known
+ * failures. If any row has too many known + simulated
+ * failures, then we bail on reconstructing with this
+ * number of simulated failures. As an improvement,
+ * we could detect the number of whole known failures
+ * (i.e. we have known failures on these disks for
+ * every row; the disks never succeeded), and
+ * subtract that from the max # failures to simulate.
+ * We could go even further like the current
+ * combrec code, but that doesn't seem like it
+ * gains us very much. If we simulate a failure
+ * that is also a known failure, that's fine.
*/
- n = 0;
- for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) {
- rc = &rm->rm_col[c];
- if (rc->rc_error != 0) {
- ASSERT(n < VDEV_RAIDZ_MAXPARITY);
- tgts[n++] = c;
- }
- }
-
- ASSERT(rm->rm_firstdatacol >= n);
-
- code = vdev_raidz_reconstruct(rm, tgts, n);
-
- if (raidz_checksum_verify(zio) == 0) {
- atomic_inc_64(&raidz_corrected[code]);
-
+ if (vdev_raidz_combrec(zio) != 0) {
/*
- * If we read more parity disks than were used
- * for reconstruction, confirm that the other
- * parity disks produced correct data. This
- * routine is suboptimal in that it regenerates
- * the parity that we already used in addition
- * to the parity that we're attempting to
- * verify, but this should be a relatively
- * uncommon case, and can be optimized if it
- * becomes a problem. Note that we regenerate
- * parity when resilvering so we can write it
- * out to failed devices later.
+ * We're here because either:
+ *
+ * total_errors == rm_first_datacol, or
+ * vdev_raidz_combrec() failed
+ *
+ * In either case, there is enough bad data to prevent
+ * reconstruction.
+ *
+ * Start checksum ereports for all children which haven't
+ * failed, and the IO wasn't speculative.
*/
- if (parity_errors < rm->rm_firstdatacol - n ||
- (zio->io_flags & ZIO_FLAG_RESILVER)) {
- n = raidz_parity_verify(zio, rm);
- unexpected_errors += n;
- ASSERT(parity_errors + n <=
- rm->rm_firstdatacol);
+ zio->io_error = SET_ERROR(ECKSUM);
+
+ if (!(zio->io_flags & ZIO_FLAG_SPECULATIVE)) {
+ for (int i = 0; i < rm->rm_nrows; i++) {
+ raidz_row_t *rr = rm->rm_row[i];
+ for (int c = 0; c < rr->rr_cols; c++) {
+ raidz_col_t *rc = &rr->rr_col[c];
+ if (rc->rc_error == 0) {
+ zio_bad_cksum_t zbc;
+ zbc.zbc_has_cksum = 0;
+ zbc.zbc_injected =
+ rm->rm_ecksuminjected;
+
+ zfs_ereport_start_checksum(
+ zio->io_spa,
+ zio->io_vd->vdev_child[rc->rc_devidx],
+ zio, rc->rc_offset, rc->rc_size,
+ (void *)(uintptr_t)c, &zbc);
+ }
+ }
+ }
}
-
- goto done;
}
}
}
+ ASSERT(!vdrz->vn_expanding);
+}
+
+static void
+vdev_raidz_state_change(vdev_t *vd, int faulted, int degraded)
+{
+ vdev_raidz_t *vdrz = vd->vdev_tsd;
+ if (faulted > vdrz->vd_nparity)
+ vdev_set_state(vd, B_FALSE, VDEV_STATE_CANT_OPEN,
+ VDEV_AUX_NO_REPLICAS);
+ else if (degraded + faulted != 0)
+ vdev_set_state(vd, B_FALSE, VDEV_STATE_DEGRADED, VDEV_AUX_NONE);
+ else
+ vdev_set_state(vd, B_FALSE, VDEV_STATE_HEALTHY, VDEV_AUX_NONE);
+}
+
+static void
+raidz_copy_range(void *arg, uint64_t start, uint64_t size)
+{
+ vdev_t *vd = arg;
+ int ashift = vd->vdev_top->vdev_ashift;
+ int old_children = vd->vdev_children - 1;
+ spa_t *spa = vd->vdev_spa;
+
+ ASSERT(IS_P2ALIGNED(start, 1 << ashift));
+ ASSERT(IS_P2ALIGNED(size, 1 << ashift));
+
+ abd_t *abd = abd_alloc_for_io(1 << ashift, B_FALSE);
+ for (uint64_t i = MAX(start >> ashift, old_children);
+ i < (start + size) >> ashift; i++) {
+ int child = i % old_children;
+ int offset = (i / old_children) << ashift;
+ spa_config_enter(spa, SCL_STATE, spa, RW_READER);
+ VERIFY0(zio_wait(zio_read_phys(NULL,
+ vd->vdev_child[child],
+ offset + VDEV_LABEL_START_SIZE,
+ 1 << ashift, abd,
+ ZIO_CHECKSUM_OFF, NULL, NULL,
+ ZIO_PRIORITY_REMOVAL, 0, B_FALSE)));
+
+ child = i % vd->vdev_children;
+ offset = (i / vd->vdev_children) << ashift;
+ VERIFY0(zio_wait(zio_write_phys(NULL,
+ vd->vdev_child[child],
+ offset + VDEV_LABEL_START_SIZE,
+ 1 << ashift, abd,
+ ZIO_CHECKSUM_OFF, NULL, NULL,
+ ZIO_PRIORITY_REMOVAL, 0, B_FALSE)));
+ spa_config_exit(spa, SCL_STATE, spa);
+ }
+ abd_free(abd);
+}
+
+void
+vdev_raidz_attach_sync(void *arg, dmu_tx_t *tx)
+{
+ vdev_t *new_child = arg;
+ spa_t *spa = new_child->vdev_spa;
+ vdev_t *raidvd = new_child->vdev_parent;
+ vdev_raidz_t *vdrz = raidvd->vdev_tsd;
+ ASSERT3P(raidvd->vdev_ops, ==, &vdev_raidz_ops);
+ ASSERT3P(raidvd->vdev_top, ==, raidvd);
+ ASSERT3U(raidvd->vdev_children, >, vdrz->vd_logical_width);
+ ASSERT3U(raidvd->vdev_children, ==, vdrz->vd_physical_width + 1);
/*
- * This isn't a typical situation -- either we got a read error or
- * a child silently returned bad data. Read every block so we can
- * try again with as much data and parity as we can track down. If
- * we've already been through once before, all children will be marked
- * as tried so we'll proceed to combinatorial reconstruction.
+ * XXX assuming that no other i/o takes place while this is happening,
+ * until we increment physical_width. But ZIL could do i/o.
*/
- unexpected_errors = 1;
- rm->rm_missingdata = 0;
- rm->rm_missingparity = 0;
+ vdrz->vn_expanding = B_TRUE;
- for (c = 0; c < rm->rm_cols; c++) {
- if (rm->rm_col[c].rc_tried)
- continue;
+ /*spa_config_enter(spa, SCL_CONFIG, FTAG, RW_READER);*/
- zio_vdev_io_redone(zio);
- do {
- rc = &rm->rm_col[c];
- if (rc->rc_tried)
- continue;
- zio_nowait(zio_vdev_child_io(zio, NULL,
- vd->vdev_child[rc->rc_devidx],
- rc->rc_offset, rc->rc_abd, rc->rc_size,
- zio->io_type, zio->io_priority, 0,
- vdev_raidz_child_done, rc));
- } while (++c < rm->rm_cols);
+ range_tree_t *rt = range_tree_create(NULL, NULL);
- return;
+ for (uint64_t i = 0; i < raidvd->vdev_ms_count; i++) {
+ metaslab_t *msp = raidvd->vdev_ms[i];
+
+ /*vdev_initialize_ms_mark(msp);*/
+ mutex_enter(&msp->ms_lock);
+
+ metaslab_load_wait(msp);
+ if (!msp->ms_loaded)
+ VERIFY0(metaslab_load(msp));
+
+ /*
+ * We want to copy everything except the free (allocatable)
+ * space. Note that there may be a little bit more free
+ * space (e.g. in ms_defer), and it's fine to copy that too.
+ */
+ ASSERT(range_tree_is_empty(rt));
+ range_tree_add(rt, msp->ms_start, msp->ms_size);
+ range_tree_walk(msp->ms_allocatable, range_tree_remove, rt);
+ mutex_exit(&msp->ms_lock);
+
+ /*spa_config_exit(spa, SCL_CONFIG, FTAG);*/
+ /* Note, _vacate() doesn't visit in order */
+ range_tree_walk(rt, raidz_copy_range, raidvd);
+ range_tree_vacate(rt, NULL, NULL);
+ /*vdev_initialize_ms_unmark(msp);*/
+ /*spa_config_enter(spa, SCL_CONFIG, FTAG, RW_READER);*/
}
+ /*spa_config_exit(spa, SCL_CONFIG, FTAG);*/
+ range_tree_destroy(rt);
+
+ vdrz->vd_physical_width++;
+
+#if 0
+ raidvd->vdev_expanding = B_TRUE;
+ vdev_reopen(raidvd);
+ raidvd->vdev_expanding = B_FALSE;
+#endif
+
+ vdrz->vn_expanding = B_FALSE;
+ /* Ensure that widths get written to label config */
+ vdev_config_dirty(raidvd);
+}
+
+/*
+ * Add RAIDZ-specific fields to the config nvlist.
+ * XXX add this to vdev_ops_t?
+ */
+void
+vdev_raidz_config_generate(vdev_t *vd, nvlist_t *nv)
+{
+ spa_t *spa = vd->vdev_spa;
+ ASSERT3P(vd->vdev_ops, ==, &vdev_raidz_ops);
+ vdev_raidz_t *vdrz = vd->vdev_tsd;
+
+ /*
+ * Make sure someone hasn't managed to sneak a fancy new vdev
+ * into a crufty old storage pool.
+ */
+ ASSERT(vdrz->vd_nparity == 1 ||
+ (vdrz->vd_nparity <= 2 &&
+ spa_version(spa) >= SPA_VERSION_RAIDZ2) ||
+ (vdrz->vd_nparity <= 3 &&
+ spa_version(spa) >= SPA_VERSION_RAIDZ3));
+
/*
- * At this point we've attempted to reconstruct the data given the
- * errors we detected, and we've attempted to read all columns. There
- * must, therefore, be one or more additional problems -- silent errors
- * resulting in invalid data rather than explicit I/O errors resulting
- * in absent data. We check if there is enough additional data to
- * possibly reconstruct the data and then perform combinatorial
- * reconstruction over all possible combinations. If that fails,
- * we're cooked.
+ * Note that we'll add these even on storage pools where they
+ * aren't strictly required -- older software will just ignore
+ * it.
*/
- if (total_errors > rm->rm_firstdatacol) {
- zio->io_error = vdev_raidz_worst_error(rm);
+ fnvlist_add_uint64(nv, ZPOOL_CONFIG_NPARITY, vdrz->vd_nparity);
+ fnvlist_add_uint64(nv, ZPOOL_CONFIG_RAIDZ_LOGICAL_WIDTH,
+ vdrz->vd_logical_width);
+}
+
+/*
+ * Set RAIDZ-specific fields in the vdev_t, based on the config.
+ * Can't assume that anything about the vdev_t is already set.
+ * XXX add this to vdev_ops_t?
+ */
+void *
+vdev_raidz_get_tsd(spa_t *spa, nvlist_t *nv)
+{
+ uint64_t nparity, lw;
+ vdev_raidz_t *vdrz = kmem_zalloc(sizeof (*vdrz), KM_SLEEP);
+
+ uint_t children;
+ nvlist_t **child;
+ int error = nvlist_lookup_nvlist_array(nv,
+ ZPOOL_CONFIG_CHILDREN, &child, &children);
+ if (error != 0)
+ goto out;
+
+ vdrz->vd_logical_width = children;
+ vdrz->vd_physical_width = children;
+
+ if (nvlist_lookup_uint64(nv, ZPOOL_CONFIG_RAIDZ_LOGICAL_WIDTH,
+ &lw) == 0) {
+ vdrz->vd_logical_width = lw;
+ }
- } else if (total_errors < rm->rm_firstdatacol &&
- (code = vdev_raidz_combrec(zio, total_errors, data_errors)) != 0) {
+ if (nvlist_lookup_uint64(nv, ZPOOL_CONFIG_NPARITY,
+ &nparity) == 0) {
+ if (nparity == 0 || nparity > VDEV_RAIDZ_MAXPARITY)
+ goto out;
/*
- * If we didn't use all the available parity for the
- * combinatorial reconstruction, verify that the remaining
- * parity is correct.
+ * Previous versions could only support 1 or 2 parity
+ * device.
*/
- if (code != (1 << rm->rm_firstdatacol) - 1)
- (void) raidz_parity_verify(zio, rm);
+ if (nparity > 1 &&
+ spa_version(spa) < SPA_VERSION_RAIDZ2)
+ goto out;
+ if (nparity > 2 &&
+ spa_version(spa) < SPA_VERSION_RAIDZ3)
+ goto out;
} else {
/*
- * We're here because either:
- *
- * total_errors == rm_first_datacol, or
- * vdev_raidz_combrec() failed
- *
- * In either case, there is enough bad data to prevent
- * reconstruction.
- *
- * Start checksum ereports for all children which haven't
- * failed, and the IO wasn't speculative.
+ * We require the parity to be specified for SPAs that
+ * support multiple parity levels.
*/
- zio->io_error = SET_ERROR(ECKSUM);
-
- if (!(zio->io_flags & ZIO_FLAG_SPECULATIVE)) {
- for (c = 0; c < rm->rm_cols; c++) {
- rc = &rm->rm_col[c];
- if (rc->rc_error == 0) {
- zio_bad_cksum_t zbc;
- zbc.zbc_has_cksum = 0;
- zbc.zbc_injected =
- rm->rm_ecksuminjected;
-
- zfs_ereport_start_checksum(
- zio->io_spa,
- vd->vdev_child[rc->rc_devidx],
- zio, rc->rc_offset, rc->rc_size,
- (void *)(uintptr_t)c, &zbc);
- }
- }
- }
- }
-
-done:
- zio_checksum_verified(zio);
-
- if (zio->io_error == 0 && spa_writeable(zio->io_spa) &&
- (unexpected_errors || (zio->io_flags & ZIO_FLAG_RESILVER))) {
+ if (spa_version(spa) >= SPA_VERSION_RAIDZ2)
+ goto out;
/*
- * Use the good data we have in hand to repair damaged children.
+ * Otherwise, we default to 1 parity device for RAID-Z.
*/
- for (c = 0; c < rm->rm_cols; c++) {
- rc = &rm->rm_col[c];
- cvd = vd->vdev_child[rc->rc_devidx];
-
- if (rc->rc_error == 0)
- continue;
-
- zio_nowait(zio_vdev_child_io(zio, NULL, cvd,
- rc->rc_offset, rc->rc_abd, rc->rc_size,
- ZIO_TYPE_WRITE, ZIO_PRIORITY_ASYNC_WRITE,
- ZIO_FLAG_IO_REPAIR | (unexpected_errors ?
- ZIO_FLAG_SELF_HEAL : 0), NULL, NULL));
- }
+ nparity = 1;
}
-}
-
-static void
-vdev_raidz_state_change(vdev_t *vd, int faulted, int degraded)
-{
- if (faulted > vd->vdev_nparity)
- vdev_set_state(vd, B_FALSE, VDEV_STATE_CANT_OPEN,
- VDEV_AUX_NO_REPLICAS);
- else if (degraded + faulted != 0)
- vdev_set_state(vd, B_FALSE, VDEV_STATE_DEGRADED, VDEV_AUX_NONE);
- else
- vdev_set_state(vd, B_FALSE, VDEV_STATE_HEALTHY, VDEV_AUX_NONE);
+ vdrz->vd_nparity = nparity;
+ return (vdrz);
+out:
+ kmem_free(vdrz, sizeof (*vdrz));
+ return (NULL);
}
vdev_ops_t vdev_raidz_ops = {
Index: sys/cddl/contrib/opensolaris/uts/common/sys/fs/zfs.h
===================================================================
--- sys/cddl/contrib/opensolaris/uts/common/sys/fs/zfs.h
+++ sys/cddl/contrib/opensolaris/uts/common/sys/fs/zfs.h
@@ -556,6 +556,7 @@
#define ZPOOL_CONFIG_SPARES "spares"
#define ZPOOL_CONFIG_IS_SPARE "is_spare"
#define ZPOOL_CONFIG_NPARITY "nparity"
+#define ZPOOL_CONFIG_RAIDZ_LOGICAL_WIDTH "raidz_logical_width"
#define ZPOOL_CONFIG_HOSTID "hostid"
#define ZPOOL_CONFIG_HOSTNAME "hostname"
#define ZPOOL_CONFIG_LOADED_TIME "initial_load_time"
File Metadata
Details
Attached
Mime Type
text/plain
Expires
Mon, Apr 20, 11:02 AM (2 h, 54 m)
Storage Engine
blob
Storage Format
Raw Data
Storage Handle
31834715
Default Alt Text
D15124.diff (103 KB)
Attached To
Mode
D15124: raidz expansion PRE-ALPHA CODE
Attached
Detach File
Event Timeline
Log In to Comment