D15124.diff
No OneTemporary
Actions

Size

103 KB

Referenced Files

None

Subscribers

None

D15124.diff
View Options

	Index: sys/cddl/contrib/opensolaris/uts/common/fs/zfs/abd.c
	===================================================================
	--- sys/cddl/contrib/opensolaris/uts/common/fs/zfs/abd.c
	+++ sys/cddl/contrib/opensolaris/uts/common/fs/zfs/abd.c
	@@ -758,6 +758,9 @@
	int ret = 0;
	struct abd_iter aiter;

	+ if (size == 0)
	+ return (ret);
	+
	abd_verify(abd);
	ASSERT3U(off + size, <=, abd->abd_size);

	@@ -886,6 +889,9 @@
	int ret = 0;
	struct abd_iter daiter, saiter;

	+ if (size == 0)
	+ return (ret);
	+
	abd_verify(dabd);
	abd_verify(sabd);

	Index: sys/cddl/contrib/opensolaris/uts/common/fs/zfs/refcount.c
	===================================================================
	--- sys/cddl/contrib/opensolaris/uts/common/fs/zfs/refcount.c
	+++ sys/cddl/contrib/opensolaris/uts/common/fs/zfs/refcount.c
	@@ -91,7 +91,7 @@
	{
	reference_t *ref;

	- ASSERT(rc->rc_count == number);
	+ ASSERT3U(rc->rc_count, ==, number);
	while (ref = list_head(&rc->rc_list)) {
	list_remove(&rc->rc_list, ref);
	kmem_cache_free(reference_cache, ref);
	Index: sys/cddl/contrib/opensolaris/uts/common/fs/zfs/spa.c
	===================================================================
	--- sys/cddl/contrib/opensolaris/uts/common/fs/zfs/spa.c
	+++ sys/cddl/contrib/opensolaris/uts/common/fs/zfs/spa.c
	@@ -55,6 +55,7 @@
	#include <sys/vdev_removal.h>
	#include <sys/vdev_indirect_mapping.h>
	#include <sys/vdev_indirect_births.h>
	+#include <sys/vdev_raidz.h>
	#include <sys/metaslab.h>
	#include <sys/metaslab_impl.h>
	#include <sys/uberblock_impl.h>
	@@ -5923,8 +5924,9 @@
	vdev_t *oldvd, *newvd, *newrootvd, *pvd, *tvd;
	vdev_ops_t *pvops;
	char *oldvdpath, *newvdpath;
	- int newvd_isspare;
	+ int newvd_isspare = B_FALSE;
	int error;
	+ boolean_t raidz = B_FALSE;

	ASSERT(spa_writeable(spa));

	@@ -5947,10 +5949,16 @@
	if (oldvd == NULL)
	return (spa_vdev_exit(spa, NULL, txg, ENODEV));

	- if (!oldvd->vdev_ops->vdev_op_leaf)
	+ if (oldvd->vdev_ops == &vdev_raidz_ops) {
	+ raidz = B_TRUE;
	+ } else if (!oldvd->vdev_ops->vdev_op_leaf) {
	return (spa_vdev_exit(spa, NULL, txg, ENOTSUP));
	+ }

	- pvd = oldvd->vdev_parent;
	+ if (raidz)
	+ pvd = oldvd;
	+ else
	+ pvd = oldvd->vdev_parent;

	if ((error = spa_config_parse(spa, &newrootvd, nvroot, NULL, 0,
	VDEV_ALLOC_ATTACH)) != 0)
	@@ -5979,6 +5987,7 @@
	* vdev.
	*/
	if (pvd->vdev_ops != &vdev_mirror_ops &&
	+ pvd->vdev_ops != &vdev_raidz_ops &&
	pvd->vdev_ops != &vdev_root_ops)
	return (spa_vdev_exit(spa, newrootvd, txg, ENOTSUP));

	@@ -6018,7 +6027,8 @@
	/*
	* Make sure the new device is big enough.
	*/
	- if (newvd->vdev_asize < vdev_get_min_asize(oldvd))
	+ vdev_t *min_vdev = raidz ? oldvd->vdev_child[0] : oldvd;
	+ if (newvd->vdev_asize < vdev_get_min_asize(min_vdev))
	return (spa_vdev_exit(spa, newrootvd, txg, EOVERFLOW));

	/*
	@@ -6028,35 +6038,48 @@
	if (newvd->vdev_ashift > oldvd->vdev_top->vdev_ashift)
	return (spa_vdev_exit(spa, newrootvd, txg, EDOM));

	+ if (raidz) {
	+ oldvdpath = kmem_asprintf("raidz%u-%u",
	+ oldvd->vdev_nparity, oldvd->vdev_id);
	+ } else {
	+ oldvdpath = spa_strdup(oldvd->vdev_path);
	+ }
	+ newvdpath = spa_strdup(newvd->vdev_path);
	+
	/*
	* If this is an in-place replacement, update oldvd's path and devid
	* to make it distinguishable from newvd, and unopenable from now on.
	*/
	- if (strcmp(oldvd->vdev_path, newvd->vdev_path) == 0) {
	+ if (strcmp(oldvdpath, newvdpath) == 0) {
	spa_strfree(oldvd->vdev_path);
	- oldvd->vdev_path = kmem_alloc(strlen(newvd->vdev_path) + 5,
	+ oldvd->vdev_path = kmem_alloc(strlen(newvdpath) + 5,
	KM_SLEEP);
	- (void) sprintf(oldvd->vdev_path, "%s/%s",
	- newvd->vdev_path, "old");
	+ (void) sprintf(oldvd->vdev_path, "%s/old",
	+ newvdpath);
	if (oldvd->vdev_devid != NULL) {
	spa_strfree(oldvd->vdev_devid);
	oldvd->vdev_devid = NULL;
	}
	+ spa_strfree(oldvdpath);
	+ oldvdpath = spa_strdup(oldvd->vdev_path);
	}

	/* mark the device being resilvered */
	- newvd->vdev_resilver_txg = txg;
	+ if (!raidz)
	+ newvd->vdev_resilver_txg = txg;

	/*
	* If the parent is not a mirror, or if we're replacing, insert the new
	* mirror/replacing/spare vdev above oldvd.
	*/
	- if (pvd->vdev_ops != pvops)
	+ if (!raidz && pvd->vdev_ops != pvops)
	pvd = vdev_add_parent(oldvd, pvops);

	ASSERT(pvd->vdev_top->vdev_parent == rvd);
	+#if 0
	ASSERT(pvd->vdev_ops == pvops);
	ASSERT(oldvd->vdev_parent == pvd);
	+#endif

	/*
	* Extract the new device from its root and add it to pvd.
	@@ -6079,29 +6102,34 @@
	*/
	dtl_max_txg = txg + TXG_CONCURRENT_STATES;

	- vdev_dtl_dirty(newvd, DTL_MISSING, TXG_INITIAL,
	- dtl_max_txg - TXG_INITIAL);
	+ if (raidz) {
	+ dmu_tx_t *tx = dmu_tx_create_assigned(spa->spa_dsl_pool, txg);
	+ dsl_sync_task_nowait(spa->spa_dsl_pool, vdev_raidz_attach_sync,
	+ newvd, 0, ZFS_SPACE_CHECK_EXTRA_RESERVED, tx);
	+ dmu_tx_commit(tx);
	+ } else {
	+ vdev_dtl_dirty(newvd, DTL_MISSING, TXG_INITIAL,
	+ dtl_max_txg - TXG_INITIAL);

	- if (newvd->vdev_isspare) {
	- spa_spare_activate(newvd);
	- spa_event_notify(spa, newvd, NULL, ESC_ZFS_VDEV_SPARE);
	- }
	+ if (newvd->vdev_isspare) {
	+ spa_spare_activate(newvd);
	+ spa_event_notify(spa, newvd, NULL, ESC_ZFS_VDEV_SPARE);
	+ }

	- oldvdpath = spa_strdup(oldvd->vdev_path);
	- newvdpath = spa_strdup(newvd->vdev_path);
	- newvd_isspare = newvd->vdev_isspare;
	+ newvd_isspare = newvd->vdev_isspare;

	- /*
	- * Mark newvd's DTL dirty in this txg.
	- */
	- vdev_dirty(tvd, VDD_DTL, newvd, txg);
	+ /*
	+ * Mark newvd's DTL dirty in this txg.
	+ */
	+ vdev_dirty(tvd, VDD_DTL, newvd, txg);

	- /*
	- * Schedule the resilver to restart in the future. We do this to
	- * ensure that dmu_sync-ed blocks have been stitched into the
	- * respective datasets.
	- */
	- dsl_resilver_restart(spa->spa_dsl_pool, dtl_max_txg);
	+ /*
	+ * Schedule the resilver to restart in the future. We do this to
	+ * ensure that dmu_sync-ed blocks have been stitched into the
	+ * respective datasets.
	+ */
	+ dsl_resilver_restart(spa->spa_dsl_pool, dtl_max_txg);
	+ }

	if (spa->spa_bootfs)
	spa_event_notify(spa, newvd, NULL, ESC_ZFS_BOOTFS_VDEV_ATTACH);
	@@ -6113,6 +6141,10 @@
	*/
	(void) spa_vdev_exit(spa, newrootvd, dtl_max_txg, 0);

	+ if (raidz) {
	+ error = vdev_online(spa, guid, ZFS_ONLINE_EXPAND, NULL);
	+ }
	+
	spa_history_log_internal(spa, "vdev attach", NULL,
	"%s vdev=%s %s vdev=%s",
	replacing && newvd_isspare ? "spare in" :
	Index: sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/vdev_raidz.h
	===================================================================
	--- sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/vdev_raidz.h
	+++ sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/vdev_raidz.h
	@@ -39,10 +39,18 @@
	extern "C" {
	#endif

	-#ifdef _KERNEL
	+typedef struct vdev_raidz {
	+ int vd_logical_width;
	+ int vd_physical_width;
	+ int vd_nparity;
	+ boolean_t vn_expanding;
	+} vdev_raidz_t;
	+
	extern int vdev_raidz_physio(vdev_t *,
	caddr_t, size_t, uint64_t, uint64_t, boolean_t, boolean_t);
	-#endif
	+extern void vdev_raidz_attach_sync(void *, dmu_tx_t *);
	+extern void vdev_raidz_config_generate(vdev_t *, nvlist_t *);
	+extern void *vdev_raidz_get_tsd(spa_t *, nvlist_t *);
	#ifdef __cplusplus
	}
	#endif
	Index: sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev.c
	===================================================================
	--- sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev.c
	+++ sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev.c
	@@ -49,6 +49,7 @@
	#include <sys/arc.h>
	#include <sys/zil.h>
	#include <sys/dsl_scan.h>
	+#include <sys/vdev_raidz.h>
	#include <sys/abd.h>
	#include <sys/trim_map.h>

	@@ -584,7 +585,7 @@
	{
	vdev_ops_t *ops;
	char *type;
	- uint64_t guid = 0, islog, nparity;
	+ uint64_t guid = 0, islog;
	vdev_t *vd;
	vdev_indirect_config_t *vic;

	@@ -637,47 +638,21 @@
	if (ops == &vdev_hole_ops && spa_version(spa) < SPA_VERSION_HOLES)
	return (SET_ERROR(ENOTSUP));

	- /*
	- * Set the nparity property for RAID-Z vdevs.
	- */
	- nparity = -1ULL;
	+ void *tsd = NULL;
	+ int nparity = 0;
	if (ops == &vdev_raidz_ops) {
	- if (nvlist_lookup_uint64(nv, ZPOOL_CONFIG_NPARITY,
	- &nparity) == 0) {
	- if (nparity == 0 || nparity > VDEV_RAIDZ_MAXPARITY)
	- return (SET_ERROR(EINVAL));
	- /*
	- * Previous versions could only support 1 or 2 parity
	- * device.
	- */
	- if (nparity > 1 &&
	- spa_version(spa) < SPA_VERSION_RAIDZ2)
	- return (SET_ERROR(ENOTSUP));
	- if (nparity > 2 &&
	- spa_version(spa) < SPA_VERSION_RAIDZ3)
	- return (SET_ERROR(ENOTSUP));
	- } else {
	- /*
	- * We require the parity to be specified for SPAs that
	- * support multiple parity levels.
	- */
	- if (spa_version(spa) >= SPA_VERSION_RAIDZ2)
	- return (SET_ERROR(EINVAL));
	- /*
	- * Otherwise, we default to 1 parity device for RAID-Z.
	- */
	- nparity = 1;
	- }
	- } else {
	- nparity = 0;
	+ vdev_raidz_t *rz = tsd = vdev_raidz_get_tsd(spa, nv);
	+ if (rz == NULL)
	+ return (SET_ERROR(EINVAL));
	+ nparity = rz->vd_nparity;
	}
	- ASSERT(nparity != -1ULL);

	vd = vdev_alloc_common(spa, id, guid, ops);
	vic = &vd->vdev_indirect_config;

	vd->vdev_islog = islog;
	vd->vdev_nparity = nparity;
	+ vd->vdev_tsd = tsd;

	if (nvlist_lookup_string(nv, ZPOOL_CONFIG_PATH, &vd->vdev_path) == 0)
	vd->vdev_path = spa_strdup(vd->vdev_path);
	@@ -849,6 +824,11 @@
	ASSERT(vd->vdev_child == NULL);
	ASSERT(vd->vdev_guid_sum == vd->vdev_guid);

	+ if (vd->vdev_ops == &vdev_raidz_ops) {
	+ vdev_raidz_t *rz = vd->vdev_tsd;
	+ kmem_free(rz, sizeof(*rz));
	+ }
	+
	/*
	* Discard allocation state.
	*/
	@@ -3155,8 +3135,10 @@
	if ((vd = spa_lookup_by_guid(spa, guid, B_TRUE)) == NULL)
	return (spa_vdev_state_exit(spa, NULL, ENODEV));

	+#if 0
	if (!vd->vdev_ops->vdev_op_leaf)
	return (spa_vdev_state_exit(spa, NULL, ENOTSUP));
	+#endif

	wasoffline = (vd->vdev_offline || vd->vdev_tmpoffline);
	oldstate = vd->vdev_state;
	Index: sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev_indirect.c
	===================================================================
	--- sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev_indirect.c
	+++ sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev_indirect.c
	@@ -1078,6 +1078,13 @@
	if (vd->vdev_ops == &vdev_indirect_ops)
	return;

	+ printf("vdev_indirect_io_start_cb: src=%llx split_offset=%x dst: vd=%u off=%llx size=%x\n",
	+ (long long)zio->io_offset,
	+ (int)split_offset,
	+ (int)vd->vdev_id,
	+ (long long)offset,
	+ (int)size);
	+
	zio_nowait(zio_vdev_child_io(zio, NULL, vd, offset,
	abd_get_offset(zio->io_abd, split_offset),
	size, zio->io_type, zio->io_priority,
	Index: sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev_label.c
	===================================================================
	--- sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev_label.c
	+++ sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev_label.c
	@@ -141,6 +141,7 @@
	#include <sys/zap.h>
	#include <sys/vdev.h>
	#include <sys/vdev_impl.h>
	+#include <sys/vdev_raidz.h>
	#include <sys/uberblock_impl.h>
	#include <sys/metaslab.h>
	#include <sys/metaslab_impl.h>
	@@ -276,31 +277,13 @@
	if (vd->vdev_fru != NULL)
	fnvlist_add_string(nv, ZPOOL_CONFIG_FRU, vd->vdev_fru);

	- if (vd->vdev_nparity != 0) {
	- ASSERT(strcmp(vd->vdev_ops->vdev_op_type,
	- VDEV_TYPE_RAIDZ) == 0);
	+ if (vd->vdev_ops == &vdev_raidz_ops)
	+ vdev_raidz_config_generate(vd, nv);

	- /*
	- * Make sure someone hasn't managed to sneak a fancy new vdev
	- * into a crufty old storage pool.
	- */
	- ASSERT(vd->vdev_nparity == 1 ||
	- (vd->vdev_nparity <= 2 &&
	- spa_version(spa) >= SPA_VERSION_RAIDZ2) ||
	- (vd->vdev_nparity <= 3 &&
	- spa_version(spa) >= SPA_VERSION_RAIDZ3));
	-
	- /*
	- * Note that we'll add the nparity tag even on storage pools
	- * that only support a single parity device -- older software
	- * will just ignore it.
	- */
	- fnvlist_add_uint64(nv, ZPOOL_CONFIG_NPARITY, vd->vdev_nparity);
	- }
	-
	- if (vd->vdev_wholedisk != -1ULL)
	+ if (vd->vdev_wholedisk != -1ULL) {
	fnvlist_add_uint64(nv, ZPOOL_CONFIG_WHOLE_DISK,
	vd->vdev_wholedisk);
	+ }

	if (vd->vdev_not_present && !(flags & VDEV_CONFIG_MISSING))
	fnvlist_add_uint64(nv, ZPOOL_CONFIG_NOT_PRESENT, 1);
	Index: sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev_raidz.c
	===================================================================
	--- sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev_raidz.c
	+++ sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev_raidz.c
	@@ -28,12 +28,14 @@

	#include <sys/zfs_context.h>
	#include <sys/spa.h>
	+#include <sys/zap.h>
	#include <sys/vdev_impl.h>
	#ifdef illumos
	#include <sys/vdev_disk.h>
	#endif
	#include <sys/vdev_file.h>
	#include <sys/vdev_raidz.h>
	+#include <sys/metaslab_impl.h>
	#include <sys/zio.h>
	#include <sys/zio_checksum.h>
	#include <sys/abd.h>
	@@ -41,6 +43,12 @@
	#include <sys/fm/fs/zfs.h>
	#include <sys/bio.h>

	+#if 0
	+#ifdef ZFS_DEBUG
	+#include <sys/vdev_initialize.h> /* vdev_xlate testing */
	+#endif
	+#endif
	+
	/*
	* Virtual device vector for RAID-Z.
	*
	@@ -113,27 +121,31 @@
	uint64_t rc_offset; /* device offset */
	uint64_t rc_size; /* I/O size */
	abd_t *rc_abd; /* I/O data */
	+ void *rc_orig_data; /* pre-reconstruction */
	void *rc_gdata; /* used to store the "good" version */
	int rc_error; /* I/O error for this device */
	uint8_t rc_tried; /* Did we attempt this I/O column? */
	uint8_t rc_skipped; /* Did we skip this I/O column? */
	+ uint8_t rc_need_orig_restore; /* need to restore from orig_data? */
	} raidz_col_t;

	+typedef struct raidz_row {
	+ uint64_t rr_cols; /* Regular column count */
	+ uint64_t rr_missingdata; /* Count of missing data devices */
	+ uint64_t rr_missingparity; /* Count of missing parity devices */
	+ uint64_t rr_firstdatacol; /* First data column/parity count */
	+ abd_t *rr_abd_copy; /* rm_asize-buffer of copied data */
	+ int rr_code; /* reconstruction code */
	+ raidz_col_t rr_col[0]; /* Flexible array of I/O columns */
	+} raidz_row_t;
	+
	typedef struct raidz_map {
	- uint64_t rm_cols; /* Regular column count */
	- uint64_t rm_scols; /* Count including skipped columns */
	- uint64_t rm_bigcols; /* Number of oversized columns */
	- uint64_t rm_asize; /* Actual total I/O size */
	- uint64_t rm_missingdata; /* Count of missing data devices */
	- uint64_t rm_missingparity; /* Count of missing parity devices */
	- uint64_t rm_firstdatacol; /* First data column/parity count */
	- uint64_t rm_nskip; /* Skipped sectors for padding */
	- uint64_t rm_skipstart; /* Column index of padding start */
	- abd_t *rm_abd_copy; /* rm_asize-buffer of copied data */
	uintptr_t rm_reports; /* # of referencing checksum reports */
	- uint8_t rm_freed; /* map no longer has referencing ZIO */
	- uint8_t rm_ecksuminjected; /* checksum error was injected */
	- raidz_col_t rm_col[1]; /* Flexible array of I/O columns */
	+ boolean_t rm_freed; /* map no longer has referencing ZIO */
	+ boolean_t rm_ecksuminjected; /* checksum error was injected */
	+ int rm_nrows;
	+ int rm_nskip; /* Sectors skipped for padding */
	+ raidz_row_t *rm_row[0]; /* flexible array of rows */
	} raidz_map_t;

	#define VDEV_RAIDZ_P 0
	@@ -241,7 +253,7 @@
	0x74, 0xd6, 0xf4, 0xea, 0xa8, 0x50, 0x58, 0xaf,
	};

	-static void vdev_raidz_generate_parity(raidz_map_t *rm);
	+static void vdev_raidz_generate_parity(raidz_row_t *);

	/*
	* Multiply a given number by 2 raised to the given power.
	@@ -263,31 +275,46 @@
	}

	static void
	-vdev_raidz_map_free(raidz_map_t *rm)
	+vdev_raidz_row_free(raidz_row_t *rr)
	{
	int c;
	- size_t size;

	- for (c = 0; c < rm->rm_firstdatacol; c++) {
	- if (rm->rm_col[c].rc_abd != NULL)
	- abd_free(rm->rm_col[c].rc_abd);
	+ for (c = 0; c < rr->rr_firstdatacol && c < rr->rr_cols; c++) {
	+ if (rr->rr_col[c].rc_abd != NULL)
	+ abd_free(rr->rr_col[c].rc_abd);

	- if (rm->rm_col[c].rc_gdata != NULL)
	- zio_buf_free(rm->rm_col[c].rc_gdata,
	- rm->rm_col[c].rc_size);
	+ if (rr->rr_col[c].rc_gdata != NULL) {
	+ zio_buf_free(rr->rr_col[c].rc_gdata,
	+ rr->rr_col[c].rc_size);
	+ }
	+ if (rr->rr_col[c].rc_orig_data != NULL) {
	+ zio_buf_free(rr->rr_col[c].rc_orig_data,
	+ rr->rr_col[c].rc_size);
	+ }
	}

	- size = 0;
	- for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) {
	- if (rm->rm_col[c].rc_abd != NULL)
	- abd_put(rm->rm_col[c].rc_abd);
	- size += rm->rm_col[c].rc_size;
	+ for (c = rr->rr_firstdatacol; c < rr->rr_cols; c++) {
	+ if (rr->rr_col[c].rc_abd != NULL)
	+ abd_put(rr->rr_col[c].rc_abd);
	+ if (rr->rr_col[c].rc_orig_data != NULL) {
	+ zio_buf_free(rr->rr_col[c].rc_orig_data,
	+ rr->rr_col[c].rc_size);
	+ }
	}

	- if (rm->rm_abd_copy != NULL)
	- abd_free(rm->rm_abd_copy);
	+ if (rr->rr_abd_copy != NULL)
	+ abd_free(rr->rr_abd_copy);

	- kmem_free(rm, offsetof(raidz_map_t, rm_col[rm->rm_scols]));
	+ kmem_free(rr, offsetof(raidz_row_t, rr_col[rr->rr_cols]));
	+}
	+
	+static void
	+vdev_raidz_map_free(raidz_map_t *rm)
	+{
	+ for (int i = 0; i < rm->rm_nrows; i++) {
	+ vdev_raidz_row_free(rm->rm_row[i]);
	+ }
	+ kmem_free(rm, offsetof(raidz_map_t, rm_row[rm->rm_nrows]));
	}

	static void
	@@ -296,10 +323,11 @@
	raidz_map_t *rm = zio->io_vsd;

	ASSERT0(rm->rm_freed);
	- rm->rm_freed = 1;
	+ rm->rm_freed = B_TRUE;

	- if (rm->rm_reports == 0)
	+ if (rm->rm_reports == 0) {
	vdev_raidz_map_free(rm);
	+ }
	}

	/*ARGSUSED*/
	@@ -310,7 +338,7 @@

	ASSERT3U(rm->rm_reports, >, 0);

	- if (--rm->rm_reports == 0 && rm->rm_freed != 0)
	+ if (--rm->rm_reports == 0 && rm->rm_freed)
	vdev_raidz_map_free(rm);
	}

	@@ -324,18 +352,22 @@
	const char *good = NULL;
	char *bad;

	+ zfs_dbgmsg("checksum error on rm=%p", rm);
	+
	if (good_data == NULL) {
	zfs_ereport_finish_checksum(zcr, NULL, NULL, B_FALSE);
	return;
	}

	- if (c < rm->rm_firstdatacol) {
	+ zfs_ereport_finish_checksum(zcr, NULL, NULL, B_FALSE);
	+#if 0
	+ if (c < rm->rr_firstdatacol) {
	/*
	* The first time through, calculate the parity blocks for
	* the good data (this relies on the fact that the good
	* data never changes for a given logical ZIO)
	*/
	- if (rm->rm_col[0].rc_gdata == NULL) {
	+ if (rm->rr_col[0].rc_gdata == NULL) {
	abd_t *bad_parity[VDEV_RAIDZ_MAXPARITY];
	char *buf;
	int offset;
	@@ -345,22 +377,22 @@
	* good_data, first saving the parity bufs and
	* replacing them with buffers to hold the result.
	*/
	- for (x = 0; x < rm->rm_firstdatacol; x++) {
	- bad_parity[x] = rm->rm_col[x].rc_abd;
	- rm->rm_col[x].rc_gdata =
	- zio_buf_alloc(rm->rm_col[x].rc_size);
	- rm->rm_col[x].rc_abd =
	- abd_get_from_buf(rm->rm_col[x].rc_gdata,
	- rm->rm_col[x].rc_size);
	+ for (x = 0; x < rm->rr_firstdatacol; x++) {
	+ bad_parity[x] = rm->rr_col[x].rc_abd;
	+ rm->rr_col[x].rc_gdata =
	+ zio_buf_alloc(rm->rr_col[x].rc_size);
	+ rm->rr_col[x].rc_abd =
	+ abd_get_from_buf(rm->rr_col[x].rc_gdata,
	+ rm->rr_col[x].rc_size);
	}

	/* fill in the data columns from good_data */
	buf = (char *)good_data;
	- for (; x < rm->rm_cols; x++) {
	- abd_put(rm->rm_col[x].rc_abd);
	- rm->rm_col[x].rc_abd = abd_get_from_buf(buf,
	- rm->rm_col[x].rc_size);
	- buf += rm->rm_col[x].rc_size;
	+ for (; x < rm->rr_cols; x++) {
	+ abd_put(rm->rr_col[x].rc_abd);
	+ rm->rr_col[x].rc_abd = abd_get_from_buf(buf,
	+ rm->rr_col[x].rc_size);
	+ buf += rm->rr_col[x].rc_size;
	}

	/*
	@@ -369,34 +401,35 @@
	vdev_raidz_generate_parity(rm);

	/* restore everything back to its original state */
	- for (x = 0; x < rm->rm_firstdatacol; x++) {
	- abd_put(rm->rm_col[x].rc_abd);
	- rm->rm_col[x].rc_abd = bad_parity[x];
	+ for (x = 0; x < rm->rr_firstdatacol; x++) {
	+ abd_put(rm->rr_col[x].rc_abd);
	+ rm->rr_col[x].rc_abd = bad_parity[x];
	}

	offset = 0;
	- for (x = rm->rm_firstdatacol; x < rm->rm_cols; x++) {
	- abd_put(rm->rm_col[x].rc_abd);
	- rm->rm_col[x].rc_abd = abd_get_offset(
	- rm->rm_abd_copy, offset);
	- offset += rm->rm_col[x].rc_size;
	+ for (x = rm->rr_firstdatacol; x < rm->rr_cols; x++) {
	+ abd_put(rm->rr_col[x].rc_abd);
	+ rm->rr_col[x].rc_abd = abd_get_offset(
	+ rm->rr_abd_copy, offset);
	+ offset += rm->rr_col[x].rc_size;
	}
	}

	- ASSERT3P(rm->rm_col[c].rc_gdata, !=, NULL);
	- good = rm->rm_col[c].rc_gdata;
	+ ASSERT3P(rm->rr_col[c].rc_gdata, !=, NULL);
	+ good = rm->rr_col[c].rc_gdata;
	} else {
	/* adjust good_data to point at the start of our column */
	good = good_data;

	- for (x = rm->rm_firstdatacol; x < c; x++)
	- good += rm->rm_col[x].rc_size;
	+ for (x = rm->rr_firstdatacol; x < c; x++)
	+ good += rm->rr_col[x].rc_size;
	}

	- bad = abd_borrow_buf_copy(rm->rm_col[c].rc_abd, rm->rm_col[c].rc_size);
	+ bad = abd_borrow_buf_copy(rm->rr_col[c].rc_abd, rm->rr_col[c].rc_size);
	/* we drop the ereport if it ends up that the data was good */
	zfs_ereport_finish_checksum(zcr, good, bad, B_TRUE);
	- abd_return_buf(rm->rm_col[c].rc_abd, bad, rm->rm_col[c].rc_size);
	+ abd_return_buf(rm->rr_col[c].rc_abd, bad, rm->rr_col[c].rc_size);
	+#endif
	}

	/*
	@@ -409,10 +442,7 @@
	vdev_raidz_cksum_report(zio_t *zio, zio_cksum_report_t *zcr, void *arg)
	{
	size_t c = (size_t)(uintptr_t)arg;
	- size_t offset;
	-
	raidz_map_t *rm = zio->io_vsd;
	- size_t size;

	/* set up the report and bump the refcount */
	zcr->zcr_cbdata = rm;
	@@ -423,7 +453,7 @@
	rm->rm_reports++;
	ASSERT3U(rm->rm_reports, >, 0);

	- if (rm->rm_abd_copy != NULL)
	+ if (rm->rm_row[0]->rr_abd_copy != NULL)
	return;

	/*
	@@ -435,24 +465,33 @@
	* to copy them.
	*/

	- size = 0;
	- for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++)
	- size += rm->rm_col[c].rc_size;
	+ for (int i = 0; i < rm->rm_nrows; i++) {
	+ raidz_row_t *rr = rm->rm_row[i];
	+ size_t offset;
	+ size_t size = 0;
	+ for (c = rr->rr_firstdatacol; c < rr->rr_cols; c++)
	+ size += rr->rr_col[c].rc_size;
	+
	+ rr->rr_abd_copy =
	+ abd_alloc_sametype(rr->rr_col[rr->rr_firstdatacol].rc_abd,
	+ size);

	- rm->rm_abd_copy =
	- abd_alloc_sametype(rm->rm_col[rm->rm_firstdatacol].rc_abd, size);
	+ for (offset = 0, c = rr->rr_firstdatacol; c < rr->rr_cols; c++) {
	+ raidz_col_t *col = &rr->rr_col[c];
	+
	+ if (col->rc_size == 0)
	+ continue;

	- for (offset = 0, c = rm->rm_firstdatacol; c < rm->rm_cols; c++) {
	- raidz_col_t *col = &rm->rm_col[c];
	- abd_t *tmp = abd_get_offset(rm->rm_abd_copy, offset);
	+ abd_t *tmp = abd_get_offset(rr->rr_abd_copy, offset);

	- abd_copy(tmp, col->rc_abd, col->rc_size);
	- abd_put(col->rc_abd);
	- col->rc_abd = tmp;
	+ abd_copy(tmp, col->rc_abd, col->rc_size);
	+ abd_put(col->rc_abd);
	+ col->rc_abd = tmp;

	- offset += col->rc_size;
	+ offset += col->rc_size;
	+ }
	+ ASSERT3U(offset, ==, size);
	}
	- ASSERT3U(offset, ==, size);
	}

	static const zio_vsd_ops_t vdev_raidz_vsd_ops = {
	@@ -468,7 +507,7 @@
	vdev_raidz_map_alloc(abd_t *abd, uint64_t size, uint64_t offset, boolean_t dofree,
	uint64_t unit_shift, uint64_t dcols, uint64_t nparity)
	{
	- raidz_map_t *rm;
	+ raidz_row_t *rr;
	/* The starting RAIDZ (parent) vdev sector of the block. */
	uint64_t b = offset >> unit_shift;
	/* The zio's size in units of the vdev's minimum sector size. */
	@@ -477,9 +516,13 @@
	uint64_t f = b % dcols;
	/* The starting byte offset on each child vdev. */
	uint64_t o = (b / dcols) << unit_shift;
	- uint64_t q, r, c, bc, col, acols, scols, coff, devidx, asize, tot;
	+ uint64_t q, r, c, bc, col, acols, coff, devidx, asize, tot;
	uint64_t off = 0;

	+ raidz_map_t *rm =
	+ kmem_zalloc(offsetof(raidz_map_t, rm_row[1]), KM_SLEEP);
	+ rm->rm_nrows = 1;
	+
	/*
	* "Quotient": The number of data sectors for this stripe on all but
	* the "big column" child vdevs that also contain "remainder" data.
	@@ -502,77 +545,63 @@
	tot = s + nparity * (q + (r == 0 ? 0 : 1));

	/* acols: The columns that will be accessed. */
	- /* scols: The columns that will be accessed or skipped. */
	if (q == 0) {
	/* Our I/O request doesn't span all child vdevs. */
	acols = bc;
	- scols = MIN(dcols, roundup(bc, nparity + 1));
	} else {
	acols = dcols;
	- scols = dcols;
	}

	- ASSERT3U(acols, <=, scols);
	-
	- rm = kmem_alloc(offsetof(raidz_map_t, rm_col[scols]), KM_SLEEP);
	+ rr = kmem_alloc(offsetof(raidz_row_t, rr_col[acols]), KM_SLEEP);
	+ rm->rm_row[0] = rr;

	- rm->rm_cols = acols;
	- rm->rm_scols = scols;
	- rm->rm_bigcols = bc;
	- rm->rm_skipstart = bc;
	- rm->rm_missingdata = 0;
	- rm->rm_missingparity = 0;
	- rm->rm_firstdatacol = nparity;
	- rm->rm_abd_copy = NULL;
	- rm->rm_reports = 0;
	- rm->rm_freed = 0;
	- rm->rm_ecksuminjected = 0;
	+ rr->rr_cols = acols;
	+ rr->rr_missingdata = 0;
	+ rr->rr_missingparity = 0;
	+ rr->rr_firstdatacol = nparity;
	+ rr->rr_abd_copy = NULL;

	asize = 0;

	- for (c = 0; c < scols; c++) {
	+ for (c = 0; c < acols; c++) {
	col = f + c;
	coff = o;
	if (col >= dcols) {
	col -= dcols;
	coff += 1ULL << unit_shift;
	}
	- rm->rm_col[c].rc_devidx = col;
	- rm->rm_col[c].rc_offset = coff;
	- rm->rm_col[c].rc_abd = NULL;
	- rm->rm_col[c].rc_gdata = NULL;
	- rm->rm_col[c].rc_error = 0;
	- rm->rm_col[c].rc_tried = 0;
	- rm->rm_col[c].rc_skipped = 0;
	-
	- if (c >= acols)
	- rm->rm_col[c].rc_size = 0;
	- else if (c < bc)
	- rm->rm_col[c].rc_size = (q + 1) << unit_shift;
	+ rr->rr_col[c].rc_devidx = col;
	+ rr->rr_col[c].rc_offset = coff;
	+ rr->rr_col[c].rc_abd = NULL;
	+ rr->rr_col[c].rc_gdata = NULL;
	+ rr->rr_col[c].rc_orig_data = NULL;
	+ rr->rr_col[c].rc_error = 0;
	+ rr->rr_col[c].rc_tried = 0;
	+ rr->rr_col[c].rc_skipped = 0;
	+ rr->rr_col[c].rc_need_orig_restore = B_FALSE;
	+
	+ if (c < bc)
	+ rr->rr_col[c].rc_size = (q + 1) << unit_shift;
	else
	- rm->rm_col[c].rc_size = q << unit_shift;
	+ rr->rr_col[c].rc_size = q << unit_shift;

	- asize += rm->rm_col[c].rc_size;
	+ asize += rr->rr_col[c].rc_size;
	}

	ASSERT3U(asize, ==, tot << unit_shift);
	- rm->rm_asize = roundup(asize, (nparity + 1) << unit_shift);
	rm->rm_nskip = roundup(tot, nparity + 1) - tot;
	- ASSERT3U(rm->rm_asize - asize, ==, rm->rm_nskip << unit_shift);
	- ASSERT3U(rm->rm_nskip, <=, nparity);

	if (!dofree) {
	- for (c = 0; c < rm->rm_firstdatacol; c++) {
	- rm->rm_col[c].rc_abd =
	- abd_alloc_linear(rm->rm_col[c].rc_size, B_TRUE);
	- }
	+ for (c = 0; c < rr->rr_firstdatacol; c++)
	+ rr->rr_col[c].rc_abd =
	+ abd_alloc_linear(rr->rr_col[c].rc_size, B_TRUE);

	- rm->rm_col[c].rc_abd = abd_get_offset(abd, 0);
	- off = rm->rm_col[c].rc_size;
	+ rr->rr_col[c].rc_abd = abd_get_offset(abd, 0);
	+ off = rr->rr_col[c].rc_size;

	for (c = c + 1; c < acols; c++) {
	- rm->rm_col[c].rc_abd = abd_get_offset(abd, off);
	- off += rm->rm_col[c].rc_size;
	+ rr->rr_col[c].rc_abd = abd_get_offset(abd, off);
	+ off += rr->rr_col[c].rc_size;
	}
	}

	@@ -596,20 +625,182 @@
	* skip the first column since at least one data and one parity
	* column must appear in each row.
	*/
	- ASSERT(rm->rm_cols >= 2);
	- ASSERT(rm->rm_col[0].rc_size == rm->rm_col[1].rc_size);
	+ ASSERT(rr->rr_cols >= 2);
	+ ASSERT(rr->rr_col[0].rc_size == rr->rr_col[1].rc_size);
	+
	+ if (rr->rr_firstdatacol == 1 && (offset & (1ULL << 20))) {
	+ devidx = rr->rr_col[0].rc_devidx;
	+ o = rr->rr_col[0].rc_offset;
	+ rr->rr_col[0].rc_devidx = rr->rr_col[1].rc_devidx;
	+ rr->rr_col[0].rc_offset = rr->rr_col[1].rc_offset;
	+ rr->rr_col[1].rc_devidx = devidx;
	+ rr->rr_col[1].rc_offset = o;
	+ }
	+
	+ return (rm);
	+}
	+
	+static raidz_map_t *
	+vdev_raidz_map_alloc_expanded(abd_t *abd, uint64_t size, uint64_t offset, boolean_t dofree,
	+ uint64_t ashift, uint64_t physical_cols, uint64_t logical_cols,
	+ uint64_t nparity)
	+{
	+ /* The starting RAIDZ (parent) vdev sector of the block. */
	+ uint64_t b = offset >> ashift;
	+ /* The zio's size in units of the vdev's minimum sector size. */
	+ uint64_t s = size >> ashift;
	+ uint64_t cur_col = b % physical_cols;
	+ /* The starting byte offset on each child vdev. */
	+ uint64_t child_offset = (b / physical_cols) << ashift;
	+ uint64_t q, r, bc, devidx, asize, tot;
	+
	+ /*
	+ * "Quotient": The number of data sectors for this stripe on all but
	+ * the "big column" child vdevs that also contain "remainder" data.
	+ * AKA "full rows"
	+ */
	+ q = s / (logical_cols - nparity);
	+
	+ /*
	+ * "Remainder": The number of partial stripe data sectors in this I/O.
	+ * This will add a sector to some, but not all, child vdevs.
	+ */
	+ r = s - q * (logical_cols - nparity);
	+
	+ /* The number of "big columns" - those which contain remainder data. */
	+ bc = (r == 0 ? 0 : r + nparity);
	+
	+ /*
	+ * The total number of data and parity sectors associated with
	+ * this I/O.
	+ */
	+ tot = s + nparity * (q + (r == 0 ? 0 : 1));
	+
	+ /* How many rows contain data (not skip) */
	+ uint64_t rows = howmany(tot, logical_cols);
	+ int cols = MIN(tot, logical_cols);
	+
	+ raidz_map_t *rm =
	+ kmem_zalloc(offsetof(raidz_map_t, rm_row[rows]),
	+ KM_SLEEP);
	+ rm->rm_nrows = rows;
	+ rm->rm_nskip = roundup(tot, nparity + 1) - tot;
	+ asize = 0;

	- if (rm->rm_firstdatacol == 1 && (offset & (1ULL << 20))) {
	- devidx = rm->rm_col[0].rc_devidx;
	- o = rm->rm_col[0].rc_offset;
	- rm->rm_col[0].rc_devidx = rm->rm_col[1].rc_devidx;
	- rm->rm_col[0].rc_offset = rm->rm_col[1].rc_offset;
	- rm->rm_col[1].rc_devidx = devidx;
	- rm->rm_col[1].rc_offset = o;
	+ zfs_dbgmsg("rm=%p s=%d q=%d r=%d bc=%d nrows=%d cols=%d",
	+ rm, (int)s, (int)q, (int)r, (int)bc, (int)rows, (int)cols);
	+
	+ for (uint64_t row = 0; row < rows; row++) {
	+ raidz_row_t *rr = kmem_alloc(offsetof(raidz_row_t,
	+ rr_col[cols]), KM_SLEEP);
	+ rm->rm_row[row] = rr;
	+
	+ /*
	+ * We set cols to the entire width of the block, even
	+ * if this row is shorter. This is needed because parity
	+ * generation (for Q and R) needs to know the entire width,
	+ * because it treats the short row as though it was
	+ * full-width (and the "phantom" sectors were zero-filled).
	+ *
	+ * Another approach to this would be to set cols shorter
	+ * (to just the number of columns that we might do i/o to)
	+ * and have another mechanism to tell the parity generation
	+ * about the "entire width". Reconstruction (at least
	+ * vdev_raidz_reconstruct_general()) would also need to
	+ * know about the "entire width".
	+ */
	+ rr->rr_cols = cols;
	+ rr->rr_missingdata = 0;
	+ rr->rr_missingparity = 0;
	+ rr->rr_firstdatacol = nparity;
	+ rr->rr_abd_copy = NULL;
	+
	+ for (int c = 0; c < rr->rr_cols; c++, cur_col++) {
	+ if (cur_col >= physical_cols) {
	+ cur_col -= physical_cols;
	+ child_offset += 1ULL << ashift;
	+ }
	+ rr->rr_col[c].rc_devidx = cur_col;
	+ rr->rr_col[c].rc_offset = child_offset;
	+ rr->rr_col[c].rc_gdata = NULL;
	+ rr->rr_col[c].rc_orig_data = NULL;
	+ rr->rr_col[c].rc_error = 0;
	+ rr->rr_col[c].rc_tried = 0;
	+ rr->rr_col[c].rc_skipped = 0;
	+ rr->rr_col[c].rc_abd = NULL;
	+ rr->rr_col[c].rc_need_orig_restore = B_FALSE;
	+
	+ uint64_t dc = c - rr->rr_firstdatacol;
	+ if (c < rr->rr_firstdatacol) {
	+ rr->rr_col[c].rc_size = 1ULL << ashift;
	+ if (!dofree) {
	+ rr->rr_col[c].rc_abd =
	+ abd_alloc_linear(rr->rr_col[c].rc_size,
	+ B_TRUE);
	+ }
	+ } else if (row == rows - 1 && bc != 0 && c >= bc) {
	+ /*
	+ * Past the end; this is for parity generation.
	+ */
	+ rr->rr_col[c].rc_size = 0;
	+ rr->rr_col[c].rc_abd = NULL;
	+ } else {
	+ /* XXX ASCII art diagram here */
	+ /* "data column" (col excluding parity) */
	+ uint64_t off;
	+
	+ if (c < bc || r == 0) {
	+ off = dc * rows + row;
	+ } else {
	+ off = r * rows +
	+ (dc - r) * (rows - 1) + row;
	+ }
	+ zfs_dbgmsg("rm=%p row=%d c=%d dc=%d off=%u devidx=%u",
	+ rm, (int)row, (int)c, (int)dc, (int)off, (int)cur_col);
	+ rr->rr_col[c].rc_size = 1ULL << ashift;
	+ if (!dofree) {
	+ rr->rr_col[c].rc_abd =
	+ abd_get_offset(abd, off << ashift);
	+ }
	+ }
	+
	+ asize += rr->rr_col[c].rc_size;
	+ }
	+
	+ /*
	+ * If all data stored spans all columns, there's a danger that parity
	+ * will always be on the same device and, since parity isn't read
	+ * during normal operation, that that device's I/O bandwidth won't be
	+ * used effectively. We therefore switch the parity every 1MB.
	+ *
	+ * ... at least that was, ostensibly, the theory. As a practical
	+ * matter unless we juggle the parity between all devices evenly, we
	+ * won't see any benefit. Further, occasional writes that aren't a
	+ * multiple of the LCM of the number of children and the minimum
	+ * stripe width are sufficient to avoid pessimal behavior.
	+ * Unfortunately, this decision created an implicit on-disk format
	+ * requirement that we need to support for all eternity, but only
	+ * for single-parity RAID-Z.
	+ *
	+ * If we intend to skip a sector in the zeroth column for padding
	+ * we must make sure to note this swap. We will never intend to
	+ * skip the first column since at least one data and one parity
	+ * column must appear in each row.
	+ */
	+ if (rr->rr_firstdatacol == 1 && rr->rr_cols > 1 &&
	+ (offset & (1ULL << 20))) {
	+ ASSERT(rr->rr_cols >= 2);
	+ ASSERT(rr->rr_col[0].rc_size == rr->rr_col[1].rc_size);
	+ devidx = rr->rr_col[0].rc_devidx;
	+ uint64_t o = rr->rr_col[0].rc_offset;
	+ rr->rr_col[0].rc_devidx = rr->rr_col[1].rc_devidx;
	+ rr->rr_col[0].rc_offset = rr->rr_col[1].rc_offset;
	+ rr->rr_col[1].rc_devidx = devidx;
	+ rr->rr_col[1].rc_offset = o;
	+ }

	- if (rm->rm_skipstart == 0)
	- rm->rm_skipstart = 1;
	}
	+ ASSERT3U(asize, ==, tot << ashift);

	return (rm);
	}
	@@ -676,55 +867,48 @@
	}

	static void
	-vdev_raidz_generate_parity_p(raidz_map_t *rm)
	+vdev_raidz_generate_parity_p(raidz_row_t *rr)
	{
	- uint64_t *p;
	- int c;
	- abd_t *src;
	+ uint64_t *p = abd_to_buf(rr->rr_col[VDEV_RAIDZ_P].rc_abd);

	- for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) {
	- src = rm->rm_col[c].rc_abd;
	- p = abd_to_buf(rm->rm_col[VDEV_RAIDZ_P].rc_abd);
	+ for (int c = rr->rr_firstdatacol; c < rr->rr_cols; c++) {
	+ abd_t *src = rr->rr_col[c].rc_abd;

	- if (c == rm->rm_firstdatacol) {
	- abd_copy_to_buf(p, src, rm->rm_col[c].rc_size);
	+ if (c == rr->rr_firstdatacol) {
	+ abd_copy_to_buf(p, src, rr->rr_col[c].rc_size);
	} else {
	struct pqr_struct pqr = { p, NULL, NULL };
	- (void) abd_iterate_func(src, 0, rm->rm_col[c].rc_size,
	+ (void) abd_iterate_func(src, 0, rr->rr_col[c].rc_size,
	vdev_raidz_p_func, &pqr);
	}
	}
	}

	static void
	-vdev_raidz_generate_parity_pq(raidz_map_t *rm)
	+vdev_raidz_generate_parity_pq(raidz_row_t *rr)
	{
	- uint64_t p, q, pcnt, ccnt, mask, i;
	- int c;
	- abd_t *src;
	+ uint64_t *p = abd_to_buf(rr->rr_col[VDEV_RAIDZ_P].rc_abd);
	+ uint64_t *q = abd_to_buf(rr->rr_col[VDEV_RAIDZ_Q].rc_abd);
	+ uint64_t pcnt = rr->rr_col[VDEV_RAIDZ_P].rc_size / sizeof (p[0]);
	+ ASSERT(rr->rr_col[VDEV_RAIDZ_P].rc_size ==
	+ rr->rr_col[VDEV_RAIDZ_Q].rc_size);

	- pcnt = rm->rm_col[VDEV_RAIDZ_P].rc_size / sizeof (p[0]);
	- ASSERT(rm->rm_col[VDEV_RAIDZ_P].rc_size ==
	- rm->rm_col[VDEV_RAIDZ_Q].rc_size);
	+ for (int c = rr->rr_firstdatacol; c < rr->rr_cols; c++) {
	+ abd_t *src = rr->rr_col[c].rc_abd;

	- for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) {
	- src = rm->rm_col[c].rc_abd;
	- p = abd_to_buf(rm->rm_col[VDEV_RAIDZ_P].rc_abd);
	- q = abd_to_buf(rm->rm_col[VDEV_RAIDZ_Q].rc_abd);
	+ uint64_t ccnt = rr->rr_col[c].rc_size / sizeof (p[0]);

	- ccnt = rm->rm_col[c].rc_size / sizeof (p[0]);
	-
	- if (c == rm->rm_firstdatacol) {
	- abd_copy_to_buf(p, src, rm->rm_col[c].rc_size);
	- (void) memcpy(q, p, rm->rm_col[c].rc_size);
	+ if (c == rr->rr_firstdatacol) {
	+ abd_copy_to_buf(p, src, rr->rr_col[c].rc_size);
	+ (void) memcpy(q, p, rr->rr_col[c].rc_size);
	} else {
	struct pqr_struct pqr = { p, q, NULL };
	- (void) abd_iterate_func(src, 0, rm->rm_col[c].rc_size,
	+ (void) abd_iterate_func(src, 0, rr->rr_col[c].rc_size,
	vdev_raidz_pq_func, &pqr);
	}

	- if (c == rm->rm_firstdatacol) {
	- for (i = ccnt; i < pcnt; i++) {
	+ if (c == rr->rr_firstdatacol) {
	+ for (uint64_t i = ccnt; i < pcnt; i++) {
	p[i] = 0;
	q[i] = 0;
	}
	@@ -733,7 +917,8 @@
	* Treat short columns as though they are full of 0s.
	* Note that there's therefore nothing needed for P.
	*/
	- for (i = ccnt; i < pcnt; i++) {
	+ uint64_t mask;
	+ for (uint64_t i = ccnt; i < pcnt; i++) {
	VDEV_RAIDZ_64MUL_2(q[i], mask);
	}
	}
	@@ -741,38 +926,35 @@
	}

	static void
	-vdev_raidz_generate_parity_pqr(raidz_map_t *rm)
	+vdev_raidz_generate_parity_pqr(raidz_row_t *rr)
	{
	- uint64_t p, q, *r, pcnt, ccnt, mask, i;
	- int c;
	- abd_t *src;
	-
	- pcnt = rm->rm_col[VDEV_RAIDZ_P].rc_size / sizeof (p[0]);
	- ASSERT(rm->rm_col[VDEV_RAIDZ_P].rc_size ==
	- rm->rm_col[VDEV_RAIDZ_Q].rc_size);
	- ASSERT(rm->rm_col[VDEV_RAIDZ_P].rc_size ==
	- rm->rm_col[VDEV_RAIDZ_R].rc_size);
	-
	- for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) {
	- src = rm->rm_col[c].rc_abd;
	- p = abd_to_buf(rm->rm_col[VDEV_RAIDZ_P].rc_abd);
	- q = abd_to_buf(rm->rm_col[VDEV_RAIDZ_Q].rc_abd);
	- r = abd_to_buf(rm->rm_col[VDEV_RAIDZ_R].rc_abd);
	-
	- ccnt = rm->rm_col[c].rc_size / sizeof (p[0]);
	-
	- if (c == rm->rm_firstdatacol) {
	- abd_copy_to_buf(p, src, rm->rm_col[c].rc_size);
	- (void) memcpy(q, p, rm->rm_col[c].rc_size);
	- (void) memcpy(r, p, rm->rm_col[c].rc_size);
	+ uint64_t *p = abd_to_buf(rr->rr_col[VDEV_RAIDZ_P].rc_abd);
	+ uint64_t *q = abd_to_buf(rr->rr_col[VDEV_RAIDZ_Q].rc_abd);
	+ uint64_t *r = abd_to_buf(rr->rr_col[VDEV_RAIDZ_R].rc_abd);
	+ uint64_t pcnt = rr->rr_col[VDEV_RAIDZ_P].rc_size / sizeof (p[0]);
	+ ASSERT(rr->rr_col[VDEV_RAIDZ_P].rc_size ==
	+ rr->rr_col[VDEV_RAIDZ_Q].rc_size);
	+ ASSERT(rr->rr_col[VDEV_RAIDZ_P].rc_size ==
	+ rr->rr_col[VDEV_RAIDZ_R].rc_size);
	+
	+ for (int c = rr->rr_firstdatacol; c < rr->rr_cols; c++) {
	+ abd_t *src = rr->rr_col[c].rc_abd;
	+
	+ uint64_t ccnt = rr->rr_col[c].rc_size / sizeof (p[0]);
	+
	+ if (c == rr->rr_firstdatacol) {
	+ abd_copy_to_buf(p, src, rr->rr_col[c].rc_size);
	+ (void) memcpy(q, p, rr->rr_col[c].rc_size);
	+ (void) memcpy(r, p, rr->rr_col[c].rc_size);
	} else {
	struct pqr_struct pqr = { p, q, r };
	- (void) abd_iterate_func(src, 0, rm->rm_col[c].rc_size,
	+ (void) abd_iterate_func(src, 0, rr->rr_col[c].rc_size,
	vdev_raidz_pqr_func, &pqr);
	}

	- if (c == rm->rm_firstdatacol) {
	- for (i = ccnt; i < pcnt; i++) {
	+ if (c == rr->rr_firstdatacol) {
	+ for (uint64_t i = ccnt; i < pcnt; i++) {
	+ /* XXX does this really happen? firstdatacol should be the same size as the parity cols */
	p[i] = 0;
	q[i] = 0;
	r[i] = 0;
	@@ -782,7 +964,8 @@
	* Treat short columns as though they are full of 0s.
	* Note that there's therefore nothing needed for P.
	*/
	- for (i = ccnt; i < pcnt; i++) {
	+ uint64_t mask;
	+ for (uint64_t i = ccnt; i < pcnt; i++) {
	VDEV_RAIDZ_64MUL_2(q[i], mask);
	VDEV_RAIDZ_64MUL_4(r[i], mask);
	}
	@@ -795,17 +978,27 @@
	* parity columns available.
	*/
	static void
	-vdev_raidz_generate_parity(raidz_map_t *rm)
	+vdev_raidz_generate_parity(raidz_row_t *rr)
	{
	- switch (rm->rm_firstdatacol) {
	+ if (rr->rr_cols == 0) {
	+ /*
	+ * We are handling this block one row at a time (because
	+ * this block has a different logical vs physical width,
	+ * due to RAIDZ expansion), and this is a pad-only row,
	+ * which has no parity.
	+ */
	+ return;
	+ }
	+
	+ switch (rr->rr_firstdatacol) {
	case 1:
	- vdev_raidz_generate_parity_p(rm);
	+ vdev_raidz_generate_parity_p(rr);
	break;
	case 2:
	- vdev_raidz_generate_parity_pq(rm);
	+ vdev_raidz_generate_parity_pq(rr);
	break;
	case 3:
	- vdev_raidz_generate_parity_pqr(rm);
	+ vdev_raidz_generate_parity_pqr(rr);
	break;
	default:
	cmn_err(CE_PANIC, "invalid RAID-Z configuration");
	@@ -929,30 +1122,31 @@
	}

	static int
	-vdev_raidz_reconstruct_p(raidz_map_t rm, int tgts, int ntgts)
	+vdev_raidz_reconstruct_p(raidz_row_t rr, int tgts, int ntgts)
	{
	int x = tgts[0];
	- int c;
	abd_t dst, src;

	- ASSERT(ntgts == 1);
	- ASSERT(x >= rm->rm_firstdatacol);
	- ASSERT(x < rm->rm_cols);
	+ zfs_dbgmsg("reconstruct_p(rm=%p x=%u)",
	+ rr, x);
	+
	+ ASSERT3U(ntgts, ==, 1);
	+ ASSERT3U(x, >=, rr->rr_firstdatacol);
	+ ASSERT3U(x, <, rr->rr_cols);

	- ASSERT(rm->rm_col[x].rc_size <= rm->rm_col[VDEV_RAIDZ_P].rc_size);
	- ASSERT(rm->rm_col[x].rc_size > 0);
	+ ASSERT3U(rr->rr_col[x].rc_size, <=, rr->rr_col[VDEV_RAIDZ_P].rc_size);

	- src = rm->rm_col[VDEV_RAIDZ_P].rc_abd;
	- dst = rm->rm_col[x].rc_abd;
	+ src = rr->rr_col[VDEV_RAIDZ_P].rc_abd;
	+ dst = rr->rr_col[x].rc_abd;

	- abd_copy(dst, src, rm->rm_col[x].rc_size);
	+ abd_copy(dst, src, rr->rr_col[x].rc_size);

	- for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) {
	- uint64_t size = MIN(rm->rm_col[x].rc_size,
	- rm->rm_col[c].rc_size);
	+ for (int c = rr->rr_firstdatacol; c < rr->rr_cols; c++) {
	+ uint64_t size = MIN(rr->rr_col[x].rc_size,
	+ rr->rr_col[c].rc_size);

	- src = rm->rm_col[c].rc_abd;
	- dst = rm->rm_col[x].rc_abd;
	+ src = rr->rr_col[c].rc_abd;
	+ dst = rr->rr_col[x].rc_abd; /* XXX not needed, done above */

	if (c == x)
	continue;
	@@ -965,51 +1159,54 @@
	}

	static int
	-vdev_raidz_reconstruct_q(raidz_map_t rm, int tgts, int ntgts)
	+vdev_raidz_reconstruct_q(raidz_row_t rr, int tgts, int ntgts)
	{
	int x = tgts[0];
	int c, exp;
	abd_t dst, src;

	+ zfs_dbgmsg("reconstruct_q(rm=%p x=%u)",
	+ rr, x);
	+
	ASSERT(ntgts == 1);

	- ASSERT(rm->rm_col[x].rc_size <= rm->rm_col[VDEV_RAIDZ_Q].rc_size);
	+ ASSERT(rr->rr_col[x].rc_size <= rr->rr_col[VDEV_RAIDZ_Q].rc_size);

	- for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) {
	- uint64_t size = (c == x) ? 0 : MIN(rm->rm_col[x].rc_size,
	- rm->rm_col[c].rc_size);
	+ for (c = rr->rr_firstdatacol; c < rr->rr_cols; c++) {
	+ uint64_t size = (c == x) ? 0 : MIN(rr->rr_col[x].rc_size,
	+ rr->rr_col[c].rc_size);

	- src = rm->rm_col[c].rc_abd;
	- dst = rm->rm_col[x].rc_abd;
	+ src = rr->rr_col[c].rc_abd;
	+ dst = rr->rr_col[x].rc_abd;

	- if (c == rm->rm_firstdatacol) {
	+ if (c == rr->rr_firstdatacol) {
	abd_copy(dst, src, size);
	- if (rm->rm_col[x].rc_size > size)
	+ if (rr->rr_col[x].rc_size > size)
	abd_zero_off(dst, size,
	- rm->rm_col[x].rc_size - size);
	+ rr->rr_col[x].rc_size - size);
	} else {
	- ASSERT3U(size, <=, rm->rm_col[x].rc_size);
	+ ASSERT3U(size, <=, rr->rr_col[x].rc_size);
	(void) abd_iterate_func2(dst, src, 0, 0, size,
	vdev_raidz_reconst_q_pre_func, NULL);
	(void) abd_iterate_func(dst,
	- size, rm->rm_col[x].rc_size - size,
	+ size, rr->rr_col[x].rc_size - size,
	vdev_raidz_reconst_q_pre_tail_func, NULL);
	}
	}

	- src = rm->rm_col[VDEV_RAIDZ_Q].rc_abd;
	- dst = rm->rm_col[x].rc_abd;
	- exp = 255 - (rm->rm_cols - 1 - x);
	+ src = rr->rr_col[VDEV_RAIDZ_Q].rc_abd;
	+ dst = rr->rr_col[x].rc_abd;
	+ exp = 255 - (rr->rr_cols - 1 - x);

	struct reconst_q_struct rq = { abd_to_buf(src), exp };
	- (void) abd_iterate_func(dst, 0, rm->rm_col[x].rc_size,
	+ (void) abd_iterate_func(dst, 0, rr->rr_col[x].rc_size,
	vdev_raidz_reconst_q_post_func, &rq);

	return (1 << VDEV_RAIDZ_Q);
	}

	static int
	-vdev_raidz_reconstruct_pq(raidz_map_t rm, int tgts, int ntgts)
	+vdev_raidz_reconstruct_pq(raidz_row_t rr, int tgts, int ntgts)
	{
	uint8_t p, q, pxy, qxy, tmp, a, b, aexp, bexp;
	abd_t pdata, qdata;
	@@ -1018,12 +1215,15 @@
	int y = tgts[1];
	abd_t xd, yd;

	+ zfs_dbgmsg("reconstruct_pq(rm=%p x=%u y=%u)",
	+ rr, x, y);
	+
	ASSERT(ntgts == 2);
	ASSERT(x < y);
	- ASSERT(x >= rm->rm_firstdatacol);
	- ASSERT(y < rm->rm_cols);
	+ ASSERT(x >= rr->rr_firstdatacol);
	+ ASSERT(y < rr->rr_cols);

	- ASSERT(rm->rm_col[x].rc_size >= rm->rm_col[y].rc_size);
	+ ASSERT(rr->rr_col[x].rc_size >= rr->rr_col[y].rc_size);

	/*
	* Move the parity data aside -- we're going to compute parity as
	@@ -1032,29 +1232,29 @@
	* parity so we make those columns appear to be full of zeros by
	* setting their lengths to zero.
	*/
	- pdata = rm->rm_col[VDEV_RAIDZ_P].rc_abd;
	- qdata = rm->rm_col[VDEV_RAIDZ_Q].rc_abd;
	- xsize = rm->rm_col[x].rc_size;
	- ysize = rm->rm_col[y].rc_size;
	+ pdata = rr->rr_col[VDEV_RAIDZ_P].rc_abd;
	+ qdata = rr->rr_col[VDEV_RAIDZ_Q].rc_abd;
	+ xsize = rr->rr_col[x].rc_size;
	+ ysize = rr->rr_col[y].rc_size;

	- rm->rm_col[VDEV_RAIDZ_P].rc_abd =
	- abd_alloc_linear(rm->rm_col[VDEV_RAIDZ_P].rc_size, B_TRUE);
	- rm->rm_col[VDEV_RAIDZ_Q].rc_abd =
	- abd_alloc_linear(rm->rm_col[VDEV_RAIDZ_Q].rc_size, B_TRUE);
	- rm->rm_col[x].rc_size = 0;
	- rm->rm_col[y].rc_size = 0;
	+ rr->rr_col[VDEV_RAIDZ_P].rc_abd =
	+ abd_alloc_linear(rr->rr_col[VDEV_RAIDZ_P].rc_size, B_TRUE);
	+ rr->rr_col[VDEV_RAIDZ_Q].rc_abd =
	+ abd_alloc_linear(rr->rr_col[VDEV_RAIDZ_Q].rc_size, B_TRUE);
	+ rr->rr_col[x].rc_size = 0;
	+ rr->rr_col[y].rc_size = 0;

	- vdev_raidz_generate_parity_pq(rm);
	+ vdev_raidz_generate_parity_pq(rr);

	- rm->rm_col[x].rc_size = xsize;
	- rm->rm_col[y].rc_size = ysize;
	+ rr->rr_col[x].rc_size = xsize;
	+ rr->rr_col[y].rc_size = ysize;

	p = abd_to_buf(pdata);
	q = abd_to_buf(qdata);
	- pxy = abd_to_buf(rm->rm_col[VDEV_RAIDZ_P].rc_abd);
	- qxy = abd_to_buf(rm->rm_col[VDEV_RAIDZ_Q].rc_abd);
	- xd = rm->rm_col[x].rc_abd;
	- yd = rm->rm_col[y].rc_abd;
	+ pxy = abd_to_buf(rr->rr_col[VDEV_RAIDZ_P].rc_abd);
	+ qxy = abd_to_buf(rr->rr_col[VDEV_RAIDZ_Q].rc_abd);
	+ xd = rr->rr_col[x].rc_abd;
	+ yd = rr->rr_col[y].rc_abd;

	/*
	* We now have:
	@@ -1072,7 +1272,7 @@
	*/

	a = vdev_raidz_pow2[255 + x - y];
	- b = vdev_raidz_pow2[255 - (rm->rm_cols - 1 - x)];
	+ b = vdev_raidz_pow2[255 - (rr->rr_cols - 1 - x)];
	tmp = 255 - vdev_raidz_log2[a ^ 1];

	aexp = vdev_raidz_log2[vdev_raidz_exp2(a, tmp)];
	@@ -1085,14 +1285,14 @@
	(void) abd_iterate_func(xd, ysize, xsize - ysize,
	vdev_raidz_reconst_pq_tail_func, &rpq);

	- abd_free(rm->rm_col[VDEV_RAIDZ_P].rc_abd);
	- abd_free(rm->rm_col[VDEV_RAIDZ_Q].rc_abd);
	+ abd_free(rr->rr_col[VDEV_RAIDZ_P].rc_abd);
	+ abd_free(rr->rr_col[VDEV_RAIDZ_Q].rc_abd);

	/*
	* Restore the saved parity data.
	*/
	- rm->rm_col[VDEV_RAIDZ_P].rc_abd = pdata;
	- rm->rm_col[VDEV_RAIDZ_Q].rc_abd = qdata;
	+ rr->rr_col[VDEV_RAIDZ_P].rc_abd = pdata;
	+ rr->rr_col[VDEV_RAIDZ_Q].rc_abd = qdata;

	return ((1 << VDEV_RAIDZ_P) \| (1 << VDEV_RAIDZ_Q));
	}
	@@ -1249,13 +1449,13 @@
	/* END CSTYLED */

	static void
	-vdev_raidz_matrix_init(raidz_map_t rm, int n, int nmap, int map,
	+vdev_raidz_matrix_init(raidz_row_t rr, int n, int nmap, int map,
	uint8_t **rows)
	{
	int i, j;
	int pow;

	- ASSERT(n == rm->rm_cols - rm->rm_firstdatacol);
	+ ASSERT(n == rr->rr_cols - rr->rr_firstdatacol);

	/*
	* Fill in the missing rows of interest.
	@@ -1279,7 +1479,7 @@
	}

	static void
	-vdev_raidz_matrix_invert(raidz_map_t rm, int n, int nmissing, int missing,
	+vdev_raidz_matrix_invert(raidz_row_t rr, int n, int nmissing, int missing,
	uint8_t rows, uint8_t invrows, const uint8_t *used)
	{
	int i, j, ii, jj;
	@@ -1291,10 +1491,10 @@
	* correspond to data columns.
	*/
	for (i = 0; i < nmissing; i++) {
	- ASSERT3S(used[i], <, rm->rm_firstdatacol);
	+ ASSERT3S(used[i], <, rr->rr_firstdatacol);
	}
	for (; i < n; i++) {
	- ASSERT3S(used[i], >=, rm->rm_firstdatacol);
	+ ASSERT3S(used[i], >=, rr->rr_firstdatacol);
	}

	/*
	@@ -1311,8 +1511,8 @@
	*/
	for (i = 0; i < nmissing; i++) {
	for (j = nmissing; j < n; j++) {
	- ASSERT3U(used[j], >=, rm->rm_firstdatacol);
	- jj = used[j] - rm->rm_firstdatacol;
	+ ASSERT3U(used[j], >=, rr->rr_firstdatacol);
	+ jj = used[j] - rr->rr_firstdatacol;
	ASSERT3S(jj, <, n);
	invrows[i][j] = rows[i][jj];
	rows[i][jj] = 0;
	@@ -1373,7 +1573,7 @@
	}

	static void
	-vdev_raidz_matrix_reconstruct(raidz_map_t *rm, int n, int nmissing,
	+vdev_raidz_matrix_reconstruct(raidz_row_t *rr, int n, int nmissing,
	int missing, uint8_t invrows, const uint8_t used)
	{
	int i, j, x, cc, c;
	@@ -1405,22 +1605,24 @@

	for (i = 0; i < n; i++) {
	c = used[i];
	- ASSERT3U(c, <, rm->rm_cols);
	+ ASSERT3U(c, <, rr->rr_cols);

	- src = abd_to_buf(rm->rm_col[c].rc_abd);
	- ccount = rm->rm_col[c].rc_size;
	+ ccount = rr->rr_col[c].rc_size;
	+ ASSERT(ccount >= rr->rr_col[missing[0]].rc_size \|\| i > 0);
	+ if (ccount == 0)
	+ continue;
	+ src = abd_to_buf(rr->rr_col[c].rc_abd);
	for (j = 0; j < nmissing; j++) {
	- cc = missing[j] + rm->rm_firstdatacol;
	- ASSERT3U(cc, >=, rm->rm_firstdatacol);
	- ASSERT3U(cc, <, rm->rm_cols);
	+ cc = missing[j] + rr->rr_firstdatacol;
	+ ASSERT3U(cc, >=, rr->rr_firstdatacol);
	+ ASSERT3U(cc, <, rr->rr_cols);
	ASSERT3U(cc, !=, c);

	- dst[j] = abd_to_buf(rm->rm_col[cc].rc_abd);
	- dcount[j] = rm->rm_col[cc].rc_size;
	+ dcount[j] = rr->rr_col[cc].rc_size;
	+ if (dcount[j] != 0)
	+ dst[j] = abd_to_buf(rr->rr_col[cc].rc_abd);
	}

	- ASSERT(ccount >= rm->rm_col[missing[0]].rc_size \|\| i > 0);
	-
	for (x = 0; x < ccount; x++, src++) {
	if (*src != 0)
	log = vdev_raidz_log2[*src];
	@@ -1449,13 +1651,15 @@
	}

	static int
	-vdev_raidz_reconstruct_general(raidz_map_t rm, int tgts, int ntgts)
	+vdev_raidz_reconstruct_general(raidz_row_t rr, int tgts, int ntgts)
	{
	int n, i, c, t, tt;
	int nmissing_rows;
	int missing_rows[VDEV_RAIDZ_MAXPARITY];
	int parity_map[VDEV_RAIDZ_MAXPARITY];

	+ zfs_dbgmsg("reconstruct_general(rm=%p ntgts=%u)",
	+ rr, ntgts);
	uint8_t p, pp;
	size_t psize;

	@@ -1471,28 +1675,31 @@
	* Matrix reconstruction can't use scatter ABDs yet, so we allocate
	* temporary linear ABDs.
	*/
	- if (!abd_is_linear(rm->rm_col[rm->rm_firstdatacol].rc_abd)) {
	- bufs = kmem_alloc(rm->rm_cols * sizeof (abd_t *), KM_PUSHPAGE);
	+ if (!abd_is_linear(rr->rr_col[rr->rr_firstdatacol].rc_abd)) {
	+ bufs = kmem_alloc(rr->rr_cols * sizeof (abd_t *), KM_PUSHPAGE);

	- for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) {
	- raidz_col_t *col = &rm->rm_col[c];
	+ for (c = rr->rr_firstdatacol; c < rr->rr_cols; c++) {
	+ raidz_col_t *col = &rr->rr_col[c];

	bufs[c] = col->rc_abd;
	- col->rc_abd = abd_alloc_linear(col->rc_size, B_TRUE);
	- abd_copy(col->rc_abd, bufs[c], col->rc_size);
	+ if (bufs[c] != NULL) {
	+ col->rc_abd =
	+ abd_alloc_linear(col->rc_size, B_TRUE);
	+ abd_copy(col->rc_abd, bufs[c], col->rc_size);
	+ }
	}
	}

	- n = rm->rm_cols - rm->rm_firstdatacol;
	+ n = rr->rr_cols - rr->rr_firstdatacol;

	/*
	* Figure out which data columns are missing.
	*/
	nmissing_rows = 0;
	for (t = 0; t < ntgts; t++) {
	- if (tgts[t] >= rm->rm_firstdatacol) {
	+ if (tgts[t] >= rr->rr_firstdatacol) {
	missing_rows[nmissing_rows++] =
	- tgts[t] - rm->rm_firstdatacol;
	+ tgts[t] - rr->rr_firstdatacol;
	}
	}

	@@ -1502,7 +1709,7 @@
	*/
	for (tt = 0, c = 0, i = 0; i < nmissing_rows; c++) {
	ASSERT(tt < ntgts);
	- ASSERT(c < rm->rm_firstdatacol);
	+ ASSERT(c < rr->rr_firstdatacol);

	/*
	* Skip any targeted parity columns.
	@@ -1537,9 +1744,9 @@
	used[i] = parity_map[i];
	}

	- for (tt = 0, c = rm->rm_firstdatacol; c < rm->rm_cols; c++) {
	+ for (tt = 0, c = rr->rr_firstdatacol; c < rr->rr_cols; c++) {
	if (tt < nmissing_rows &&
	- c == missing_rows[tt] + rm->rm_firstdatacol) {
	+ c == missing_rows[tt] + rr->rr_firstdatacol) {
	tt++;
	continue;
	}
	@@ -1552,18 +1759,18 @@
	/*
	* Initialize the interesting rows of the matrix.
	*/
	- vdev_raidz_matrix_init(rm, n, nmissing_rows, parity_map, rows);
	+ vdev_raidz_matrix_init(rr, n, nmissing_rows, parity_map, rows);

	/*
	* Invert the matrix.
	*/
	- vdev_raidz_matrix_invert(rm, n, nmissing_rows, missing_rows, rows,
	+ vdev_raidz_matrix_invert(rr, n, nmissing_rows, missing_rows, rows,
	invrows, used);

	/*
	* Reconstruct the missing data using the generated matrix.
	*/
	- vdev_raidz_matrix_reconstruct(rm, n, nmissing_rows, missing_rows,
	+ vdev_raidz_matrix_reconstruct(rr, n, nmissing_rows, missing_rows,
	invrows, used);

	kmem_free(p, psize);
	@@ -1572,21 +1779,23 @@
	* copy back from temporary linear abds and free them
	*/
	if (bufs) {
	- for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) {
	- raidz_col_t *col = &rm->rm_col[c];
	+ for (c = rr->rr_firstdatacol; c < rr->rr_cols; c++) {
	+ raidz_col_t *col = &rr->rr_col[c];

	- abd_copy(bufs[c], col->rc_abd, col->rc_size);
	- abd_free(col->rc_abd);
	+ if (bufs[c] != NULL) {
	+ abd_copy(bufs[c], col->rc_abd, col->rc_size);
	+ abd_free(col->rc_abd);
	+ }
	col->rc_abd = bufs[c];
	}
	- kmem_free(bufs, rm->rm_cols * sizeof (abd_t *));
	+ kmem_free(bufs, rr->rr_cols * sizeof (abd_t *));
	}

	return (code);
	}

	static int
	-vdev_raidz_reconstruct(raidz_map_t rm, int t, int nt)
	+vdev_raidz_reconstruct_row(raidz_row_t rr, int t, int nt)
	{
	int tgts[VDEV_RAIDZ_MAXPARITY], *dt;
	int ntgts;
	@@ -1595,26 +1804,37 @@
	int nbadparity, nbaddata;
	int parity_valid[VDEV_RAIDZ_MAXPARITY];

	+ zfs_dbgmsg("reconstruct(rm=%p nt=%u cols=%u md=%u mp=%u)",
	+ rr, nt, (int)rr->rr_cols, (int)rr->rr_missingdata, (int)rr->rr_missingparity);
	+
	/*
	* The tgts list must already be sorted.
	*/
	+ zfs_dbgmsg("reconstruct(rm=%p t[%u]=%u)", rr, 0, t[0]);
	for (i = 1; i < nt; i++) {
	+ zfs_dbgmsg("reconstruct(rm=%p t[%u]=%u)",
	+ rr, i, t[i]);
	ASSERT(t[i] > t[i - 1]);
	}

	- nbadparity = rm->rm_firstdatacol;
	- nbaddata = rm->rm_cols - nbadparity;
	+ nbadparity = rr->rr_firstdatacol;
	+ nbaddata = rr->rr_cols - nbadparity;
	ntgts = 0;
	- for (i = 0, c = 0; c < rm->rm_cols; c++) {
	- if (c < rm->rm_firstdatacol)
	+ for (i = 0, c = 0; c < rr->rr_cols; c++) {
	+ zfs_dbgmsg("reconstruct(rm=%p col=%u devid=%u offset=%llx error=%u)",
	+ rr, c,
	+ (int)rr->rr_col[c].rc_devidx,
	+ (long long)rr->rr_col[c].rc_offset,
	+ (int)rr->rr_col[c].rc_error);
	+ if (c < rr->rr_firstdatacol)
	parity_valid[c] = B_FALSE;

	if (i < nt && c == t[i]) {
	tgts[ntgts++] = c;
	i++;
	- } else if (rm->rm_col[c].rc_error != 0) {
	+ } else if (rr->rr_col[c].rc_error != 0) {
	tgts[ntgts++] = c;
	- } else if (c >= rm->rm_firstdatacol) {
	+ } else if (c >= rr->rr_firstdatacol) {
	nbaddata--;
	} else {
	parity_valid[c] = B_TRUE;
	@@ -1635,30 +1855,30 @@
	switch (nbaddata) {
	case 1:
	if (parity_valid[VDEV_RAIDZ_P])
	- return (vdev_raidz_reconstruct_p(rm, dt, 1));
	+ return (vdev_raidz_reconstruct_p(rr, dt, 1));

	- ASSERT(rm->rm_firstdatacol > 1);
	+ ASSERT(rr->rr_firstdatacol > 1);

	if (parity_valid[VDEV_RAIDZ_Q])
	- return (vdev_raidz_reconstruct_q(rm, dt, 1));
	+ return (vdev_raidz_reconstruct_q(rr, dt, 1));

	- ASSERT(rm->rm_firstdatacol > 2);
	+ ASSERT(rr->rr_firstdatacol > 2);
	break;

	case 2:
	- ASSERT(rm->rm_firstdatacol > 1);
	+ ASSERT(rr->rr_firstdatacol > 1);

	if (parity_valid[VDEV_RAIDZ_P] &&
	parity_valid[VDEV_RAIDZ_Q])
	- return (vdev_raidz_reconstruct_pq(rm, dt, 2));
	+ return (vdev_raidz_reconstruct_pq(rr, dt, 2));

	- ASSERT(rm->rm_firstdatacol > 2);
	+ ASSERT(rr->rr_firstdatacol > 2);

	break;
	}
	}

	- code = vdev_raidz_reconstruct_general(rm, tgts, ntgts);
	+ code = vdev_raidz_reconstruct_general(rr, tgts, ntgts);
	ASSERT(code < (1 << VDEV_RAIDZ_MAXPARITY));
	ASSERT(code > 0);
	return (code);
	@@ -1668,8 +1888,8 @@
	vdev_raidz_open(vdev_t vd, uint64_t asize, uint64_t *max_asize,
	uint64_t logical_ashift, uint64_t physical_ashift)
	{
	- vdev_t *cvd;
	- uint64_t nparity = vd->vdev_nparity;
	+ vdev_raidz_t *vdrz = vd->vdev_tsd;
	+ uint64_t nparity = vdrz->vd_nparity;
	int c;
	int lasterror = 0;
	int numerrors = 0;
	@@ -1685,7 +1905,7 @@
	vdev_open_children(vd);

	for (c = 0; c < vd->vdev_children; c++) {
	- cvd = vd->vdev_child[c];
	+ vdev_t *cvd = vd->vdev_child[c];

	if (cvd->vdev_open_error != 0) {
	lasterror = cvd->vdev_open_error;
	@@ -1786,9 +2006,10 @@
	vdev_raidz_physio(vdev_t *vd, caddr_t data, size_t size,
	uint64_t offset, uint64_t origoffset, boolean_t doread, boolean_t isdump)
	{
	+ vdev_raidz_t *vdrz = vd->vdev_tsd;
	vdev_t *tvd = vd->vdev_top;
	vdev_t *cvd;
	- raidz_map_t *rm;
	+ raidz_row_t *rr;
	raidz_col_t *rc;
	int c, err = 0;

	@@ -1818,15 +2039,19 @@
	*/
	abd_t *abd = abd_get_from_buf(data - (offset - origoffset),
	SPA_OLD_MAXBLOCKSIZE);
	- rm = vdev_raidz_map_alloc(abd,
	+ /*
	+ * XXX deal with dump to expanded raidz
	+ */
	+ raidz_map_t *rm = vdev_raidz_map_alloc(abd,
	SPA_OLD_MAXBLOCKSIZE, origoffset, B_FALSE, tvd->vdev_ashift,
	- vd->vdev_children, vd->vdev_nparity);
	+ vd->vdev_children, vdrz->vd_nparity);
	+ rr = rm->rm_row[0];

	coloffset = origoffset;

	- for (c = rm->rm_firstdatacol; c < rm->rm_cols;
	+ for (c = rr->rr_firstdatacol; c < rr->rr_cols;
	c++, coloffset += rc->rc_size) {
	- rc = &rm->rm_col[c];
	+ rc = &rr->rr_col[c];
	cvd = vd->vdev_child[rc->rc_devidx];

	/*
	@@ -1863,7 +2088,7 @@
	break;
	}

	- vdev_raidz_map_free(rm);
	+ vdev_raidz_row_free(rr);
	abd_put(abd);
	#endif /* KERNEL */

	@@ -1874,10 +2099,11 @@
	static uint64_t
	vdev_raidz_asize(vdev_t *vd, uint64_t psize)
	{
	+ vdev_raidz_t *vdrz = vd->vdev_tsd;
	uint64_t asize;
	uint64_t ashift = vd->vdev_top->vdev_ashift;
	- uint64_t cols = vd->vdev_children;
	- uint64_t nparity = vd->vdev_nparity;
	+ uint64_t cols = vdrz->vd_logical_width;
	+ uint64_t nparity = vdrz->vd_nparity;

	asize = ((psize - 1) >> ashift) + 1;
	asize += nparity * ((asize + cols - nparity - 1) / (cols - nparity));
	@@ -1896,119 +2122,137 @@
	rc->rc_skipped = 0;
	}

	-/*
	- * Start an IO operation on a RAIDZ VDev
	- *
	- * Outline:
	- * - For write operations:
	- * 1. Generate the parity data
	- * 2. Create child zio write operations to each column's vdev, for both
	- * data and parity.
	- * 3. If the column skips any sectors for padding, create optional dummy
	- * write zio children for those areas to improve aggregation continuity.
	- * - For read operations:
	- * 1. Create child zio read operations to each data column's vdev to read
	- * the range of data required for zio.
	- * 2. If this is a scrub or resilver operation, or if any of the data
	- * vdevs have had errors, then create zio read operations to the parity
	- * columns' VDevs as well.
	- */
	static void
	-vdev_raidz_io_start(zio_t *zio)
	+vdev_raidz_io_verify(zio_t zio, raidz_row_t rr, int col)
	{
	+#if 0
	+#ifdef ZFS_DEBUG
	vdev_t *vd = zio->io_vd;
	vdev_t *tvd = vd->vdev_top;
	- vdev_t *cvd;
	- raidz_map_t *rm;
	- raidz_col_t *rc;
	- int c, i;

	- rm = vdev_raidz_map_alloc(zio->io_abd, zio->io_size, zio->io_offset,
	- zio->io_type == ZIO_TYPE_FREE,
	- tvd->vdev_ashift, vd->vdev_children,
	- vd->vdev_nparity);
	+ range_seg_t logical_rs, physical_rs;
	+ logical_rs.rs_start = zio->io_offset;
	+ logical_rs.rs_end = logical_rs.rs_start +
	+ vdev_raidz_asize(zio->io_vd, zio->io_size);

	- zio->io_vsd = rm;
	- zio->io_vsd_ops = &vdev_raidz_vsd_ops;
	+ raidz_col_t *rc = &rr->rr_col[col];
	+ vdev_t *cvd = vd->vdev_child[rc->rc_devidx];

	- ASSERT3U(rm->rm_asize, ==, vdev_psize_to_asize(vd, zio->io_size));
	+ vdev_xlate(cvd, &logical_rs, &physical_rs);
	+ ASSERT3U(rc->rc_offset, ==, physical_rs.rs_start);
	+ ASSERT3U(rc->rc_offset, <, physical_rs.rs_end);
	+ /*
	+ * It would be nice to assert that rs_end is equal
	+ * to rc_offset + rc_size but there might be an
	+ * optional I/O at the end that is not accounted for in
	+ * rc_size.
	+ */
	+ if (physical_rs.rs_end > rc->rc_offset + rc->rc_size) {
	+ ASSERT3U(physical_rs.rs_end, ==, rc->rc_offset +
	+ rc->rc_size + (1 << tvd->vdev_ashift));
	+ } else {
	+ ASSERT3U(physical_rs.rs_end, ==, rc->rc_offset + rc->rc_size);
	+ }
	+#endif
	+#endif
	+}

	- if (zio->io_type == ZIO_TYPE_FREE) {
	- for (c = 0; c < rm->rm_cols; c++) {
	- rc = &rm->rm_col[c];
	- cvd = vd->vdev_child[rc->rc_devidx];
	- zio_nowait(zio_vdev_child_io(zio, NULL, cvd,
	- rc->rc_offset, rc->rc_abd, rc->rc_size,
	- zio->io_type, zio->io_priority, 0,
	- vdev_raidz_child_done, rc));
	- }
	+static void
	+vdev_raidz_io_start_free(zio_t zio, raidz_row_t rr)
	+{
	+ vdev_t *vd = zio->io_vd;
	+ vdev_t *tvd = vd->vdev_top;

	- zio_execute(zio);
	- return;
	+ for (int c = 0; c < rr->rr_cols; c++) {
	+ raidz_col_t *rc = &rr->rr_col[c];
	+ if (rc->rc_size == 0)
	+ continue;
	+ vdev_t *cvd = vd->vdev_child[rc->rc_devidx];
	+
	+ zio_nowait(zio_vdev_child_io(zio, NULL, cvd,
	+ rc->rc_offset, rc->rc_abd, rc->rc_size,
	+ zio->io_type, zio->io_priority, 0,
	+ vdev_raidz_child_done, rc));
	}
	+}

	- if (zio->io_type == ZIO_TYPE_WRITE) {
	- vdev_raidz_generate_parity(rm);
	+static void
	+vdev_raidz_io_start_write(zio_t zio, raidz_row_t rr)
	+{
	+ vdev_t *vd = zio->io_vd;
	+ vdev_t *tvd = vd->vdev_top;

	- for (c = 0; c < rm->rm_cols; c++) {
	- rc = &rm->rm_col[c];
	- cvd = vd->vdev_child[rc->rc_devidx];
	- zio_nowait(zio_vdev_child_io(zio, NULL, cvd,
	- rc->rc_offset, rc->rc_abd, rc->rc_size,
	- zio->io_type, zio->io_priority, 0,
	- vdev_raidz_child_done, rc));
	- }
	+ vdev_raidz_generate_parity(rr);

	- /*
	- * Generate optional I/Os for any skipped sectors to improve
	- * aggregation contiguity.
	- */
	- for (c = rm->rm_skipstart, i = 0; i < rm->rm_nskip; c++, i++) {
	- ASSERT(c <= rm->rm_scols);
	- if (c == rm->rm_scols)
	- c = 0;
	- rc = &rm->rm_col[c];
	- cvd = vd->vdev_child[rc->rc_devidx];
	- zio_nowait(zio_vdev_child_io(zio, NULL, cvd,
	- rc->rc_offset + rc->rc_size, NULL,
	- 1 << tvd->vdev_ashift,
	- zio->io_type, zio->io_priority,
	- ZIO_FLAG_NODATA \| ZIO_FLAG_OPTIONAL, NULL, NULL));
	- }
	+ for (int c = 0; c < rr->rr_cols; c++) {
	+ raidz_col_t *rc = &rr->rr_col[c];
	+ if (rc->rc_size == 0)
	+ continue;
	+ vdev_t *cvd = vd->vdev_child[rc->rc_devidx];

	- zio_execute(zio);
	- return;
	+ zio_nowait(zio_vdev_child_io(zio, NULL, cvd,
	+ rc->rc_offset, rc->rc_abd, rc->rc_size,
	+ zio->io_type, zio->io_priority, 0,
	+ vdev_raidz_child_done, rc));
	+ }
	+
	+ /* XXX do this in vdev_raidz_io_start, based on nskip stored in rm
	+ */
	+#if 0
	+ /*
	+ * Generate optional I/Os for any skipped sectors to improve
	+ * aggregation contiguity.
	+ */
	+ for (int c = rr->rm_skipstart, i = 0; i < rr->rm_nskip; c++, i++) {
	+ ASSERT(c <= rr->rm_scols);
	+ if (c == rr->rm_scols)
	+ c = 0;
	+ raidz_col_t *rc = &rr->rr_col[c];
	+ vdev_t *cvd = vd->vdev_child[rc->rc_devidx];
	+ zio_nowait(zio_vdev_child_io(zio, NULL, cvd,
	+ rc->rc_offset + rc->rc_size, NULL,
	+ 1 << tvd->vdev_ashift,
	+ zio->io_type, zio->io_priority,
	+ ZIO_FLAG_NODATA \| ZIO_FLAG_OPTIONAL, NULL, NULL));
	}
	+#endif
	+}

	- ASSERT(zio->io_type == ZIO_TYPE_READ);
	+static void
	+vdev_raidz_io_start_read(zio_t zio, raidz_row_t rr, boolean_t forceparity)
	+{
	+ vdev_t *vd = zio->io_vd;

	/*
	* Iterate over the columns in reverse order so that we hit the parity
	* last -- any errors along the way will force us to read the parity.
	*/
	- for (c = rm->rm_cols - 1; c >= 0; c--) {
	- rc = &rm->rm_col[c];
	- cvd = vd->vdev_child[rc->rc_devidx];
	+ for (int c = rr->rr_cols - 1; c >= 0; c--) {
	+ raidz_col_t *rc = &rr->rr_col[c];
	+ if (rc->rc_size == 0)
	+ continue;
	+ vdev_t *cvd = vd->vdev_child[rc->rc_devidx];
	if (!vdev_readable(cvd)) {
	- if (c >= rm->rm_firstdatacol)
	- rm->rm_missingdata++;
	+ if (c >= rr->rr_firstdatacol)
	+ rr->rr_missingdata++;
	else
	- rm->rm_missingparity++;
	+ rr->rr_missingparity++;
	rc->rc_error = SET_ERROR(ENXIO);
	rc->rc_tried = 1; /* don't even try */
	rc->rc_skipped = 1;
	continue;
	}
	if (vdev_dtl_contains(cvd, DTL_MISSING, zio->io_txg, 1)) {
	- if (c >= rm->rm_firstdatacol)
	- rm->rm_missingdata++;
	+ if (c >= rr->rr_firstdatacol)
	+ rr->rr_missingdata++;
	else
	- rm->rm_missingparity++;
	+ rr->rr_missingparity++;
	rc->rc_error = SET_ERROR(ESTALE);
	rc->rc_skipped = 1;
	continue;
	}
	- if (c >= rm->rm_firstdatacol \|\| rm->rm_missingdata > 0 \|\|
	+ if (forceparity \|\|
	+ c >= rr->rr_firstdatacol \|\| rr->rr_missingdata > 0 \|\|
	(zio->io_flags & (ZIO_FLAG_SCRUB \| ZIO_FLAG_RESILVER))) {
	zio_nowait(zio_vdev_child_io(zio, NULL, cvd,
	rc->rc_offset, rc->rc_abd, rc->rc_size,
	@@ -2016,6 +2260,75 @@
	vdev_raidz_child_done, rc));
	}
	}
	+}
	+
	+/*
	+ * Start an IO operation on a RAIDZ VDev
	+ *
	+ * Outline:
	+ * - For write operations:
	+ * 1. Generate the parity data
	+ * 2. Create child zio write operations to each column's vdev, for both
	+ * data and parity.
	+ * 3. If the column skips any sectors for padding, create optional dummy
	+ * write zio children for those areas to improve aggregation contiguity.
	+ * - For read operations:
	+ * 1. Create child zio read operations to each data column's vdev to read
	+ * the range of data required for zio.
	+ * 2. If this is a scrub or resilver operation, or if any of the data
	+ * vdevs have had errors, then create zio read operations to the parity
	+ * columns' VDevs as well.
	+ */
	+static void
	+vdev_raidz_io_start(zio_t *zio)
	+{
	+ vdev_t *vd = zio->io_vd;
	+ vdev_t *tvd = vd->vdev_top;
	+ vdev_raidz_t *vdrz = vd->vdev_tsd;
	+ raidz_map_t *rm;
	+
	+ ASSERT(!vdrz->vn_expanding);
	+
	+ if (vdrz->vd_logical_width != vdrz->vd_physical_width) {
	+ rm = vdev_raidz_map_alloc_expanded(zio->io_abd,
	+ zio->io_size, zio->io_offset,
	+ zio->io_type == ZIO_TYPE_FREE,
	+ tvd->vdev_ashift, vdrz->vd_physical_width,
	+ vdrz->vd_logical_width, vdrz->vd_nparity);
	+ } else {
	+ rm = vdev_raidz_map_alloc(zio->io_abd,
	+ zio->io_size, zio->io_offset,
	+ zio->io_type == ZIO_TYPE_FREE,
	+ tvd->vdev_ashift, vdrz->vd_logical_width,
	+ vdrz->vd_nparity);
	+ }
	+
	+ zio->io_vsd = rm;
	+ zio->io_vsd_ops = &vdev_raidz_vsd_ops;
	+
	+ if (zio->io_type == ZIO_TYPE_FREE) {
	+ for (int i = 0; i < rm->rm_nrows; i++) {
	+ vdev_raidz_io_start_free(zio, rm->rm_row[i]);
	+ }
	+ } else if (zio->io_type == ZIO_TYPE_WRITE) {
	+ for (int i = 0; i < rm->rm_nrows; i++) {
	+ vdev_raidz_io_start_write(zio,
	+ rm->rm_row[i]);
	+ }
	+ } else {
	+ ASSERT(zio->io_type == ZIO_TYPE_READ);
	+ /*
	+ * If there are multiple rows, we will be hitting
	+ * all disks, so go ahead and read the parity so
	+ * that we are reading in decent size chunks.
	+ * XXX maybe doesn't really matter?
	+ */
	+ boolean_t forceparity = rm->rm_nrows > 1;
	+ for (int i = 0; i < rm->rm_nrows; i++) {
	+ vdev_raidz_io_start_read(zio,
	+ rm->rm_row[i], forceparity);
	+ }
	+ }

	zio_execute(zio);
	}
	@@ -2070,10 +2383,10 @@
	* Generate the parity from the data columns. If we tried and were able to
	* read the parity without error, verify that the generated parity matches the
	* data we read. If it doesn't, we fire off a checksum error. Return the
	- * number such failures.
	+ * number of such failures.
	*/
	static int
	-raidz_parity_verify(zio_t *zio, raidz_map_t *rm)
	+raidz_parity_verify(zio_t *zio, raidz_row_t *rr)
	{
	void *orig[VDEV_RAIDZ_MAXPARITY];
	int c, ret = 0;
	@@ -2086,21 +2399,29 @@
	if (checksum == ZIO_CHECKSUM_NOPARITY)
	return (ret);

	- for (c = 0; c < rm->rm_firstdatacol; c++) {
	- rc = &rm->rm_col[c];
	+ for (c = 0; c < rr->rr_firstdatacol; c++) {
	+ rc = &rr->rr_col[c];
	if (!rc->rc_tried || rc->rc_error != 0)
	continue;
	orig[c] = zio_buf_alloc(rc->rc_size);
	abd_copy_to_buf(orig[c], rc->rc_abd, rc->rc_size);
	}

	- vdev_raidz_generate_parity(rm);
	+ /* XXX regenerates parity even for !tried||rc_error!=0
	+ * This could cause a side effect of fixing stuff we didn't realize
	+ * was necessary (i.e. even if we return 0)
	+ */
	+ vdev_raidz_generate_parity(rr);
	+
	+ for (c = 0; c < rr->rr_firstdatacol; c++) {
	+ rc = &rr->rr_col[c];

	- for (c = 0; c < rm->rm_firstdatacol; c++) {
	- rc = &rm->rm_col[c];
	if (!rc->rc_tried || rc->rc_error != 0)
	continue;
	+
	if (abd_cmp_buf(rc->rc_abd, orig[c], rc->rc_size) != 0) {
	+ zfs_dbgmsg("raidz_parity_verify found error on col=%u devidx=%u",
	+ c, (int)rc->rc_devidx);
	raidz_checksum_error(zio, rc, orig[c]);
	rc->rc_error = SET_ERROR(ECKSUM);
	ret++;
	@@ -2117,16 +2438,83 @@
	static uint64_t raidz_corrected[1 << VDEV_RAIDZ_MAXPARITY];

	static int
	-vdev_raidz_worst_error(raidz_map_t *rm)
	+vdev_raidz_worst_error(raidz_row_t *rr)
	{
	int error = 0;

	- for (int c = 0; c < rm->rm_cols; c++)
	- error = zio_worst_error(error, rm->rm_col[c].rc_error);
	+ for (int c = 0; c < rr->rr_cols; c++)
	+ error = zio_worst_error(error, rr->rr_col[c].rc_error);

	return (error);
	}

	+static void
	+vdev_raidz_io_done_verified(zio_t *zio, raidz_row_t *rr)
	+{
	+ int unexpected_errors = 0;
	+ int parity_errors = 0;
	+ int parity_untried = 0;
	+ int data_errors = 0;
	+
	+ ASSERT3U(zio->io_type, ==, ZIO_TYPE_READ);
	+
	+ for (int c = 0; c < rr->rr_cols; c++) {
	+ raidz_col_t *rc = &rr->rr_col[c];
	+
	+ if (rc->rc_error) {
	+ if (c < rr->rr_firstdatacol)
	+ parity_errors++;
	+ else
	+ data_errors++;
	+
	+ if (!rc->rc_skipped)
	+ unexpected_errors++;
	+ } else if (c < rr->rr_firstdatacol && !rc->rc_tried) {
	+ parity_untried++;
	+ }
	+ }
	+
	+ /*
	+ * If we read more parity disks than were used for
	+ * reconstruction, confirm that the other parity disks produced
	+ * correct data.
	+ *
	+ * Note that we also regenerate parity when resilvering so we
	+ * can write it out to failed devices later.
	+ */
	+ zfs_dbgmsg("parity_errors=%u parity_untried=%u data_errors=%u verifying=%s",
	+ parity_errors, parity_untried, data_errors,
	+ (parity_errors + parity_untried < rr->rr_firstdatacol - data_errors) ? "yes" : "no");
	+ if (parity_errors + parity_untried <
	+ rr->rr_firstdatacol - data_errors ||
	+ (zio->io_flags & ZIO_FLAG_RESILVER)) {
	+ int n = raidz_parity_verify(zio, rr);
	+ unexpected_errors += n;
	+ ASSERT3U(parity_errors + n, <=, rr->rr_firstdatacol);
	+ }
	+
	+ if (zio->io_error == 0 && spa_writeable(zio->io_spa) &&
	+ (unexpected_errors > 0 || (zio->io_flags & ZIO_FLAG_RESILVER))) {
	+ /*
	+ * Use the good data we have in hand to repair damaged children.
	+ */
	+ for (int c = 0; c < rr->rr_cols; c++) {
	+ raidz_col_t *rc = &rr->rr_col[c];
	+ vdev_t *vd = zio->io_vd;
	+ vdev_t *cvd = vd->vdev_child[rc->rc_devidx];
	+
	+ if (rc->rc_error == 0 || rc->rc_size == 0)
	+ continue;
	+
	+ zio_nowait(zio_vdev_child_io(zio, NULL, cvd,
	+ rc->rc_offset, rc->rc_abd, rc->rc_size,
	+ ZIO_TYPE_WRITE, ZIO_PRIORITY_ASYNC_WRITE,
	+ ZIO_FLAG_IO_REPAIR | (unexpected_errors ?
	+ ZIO_FLAG_SELF_HEAL : 0), NULL, NULL));
	+ }
	+ }
	+}
	+
	/*
	* Iterate over all combinations of bad data and attempt a reconstruction.
	* Note that the algorithm below is non-optimal because it doesn't take into
	@@ -2134,454 +2522,771 @@
	* triple-parity RAID-Z the reconstruction procedure is the same if column 4
	* is targeted as invalid as if columns 1 and 4 are targeted since in both
	* cases we'd only use parity information in column 0.
	+ *
	+ * The order that we find the various possible combinations of failed
	+ * disks is dictated by these rules:
	+ * - Examine each "slot" (the "i" in tgts[i])
	+ * - Try to increment this slot (tgts[i] = tgts[i] + 1)
	+ * - if we can't increment because it runs into the next slot,
	+ * reset our slot to the minimum, and examine the next slot
	+ * For example, with a 6-wide RAIDZ3, and no known errors (so we have to choose
	+ * 3 columns to reconstruct), we will generate the following sequence:
	+ *
	+ * STATE ACTION
	+ * 0 1 2 special case: skip since these are all parity
	+ * 0 1 3 first slot: reset to 0; middle slot: increment to 2
	+ * 0 2 3 first slot: increment to 1
	+ * 1 2 3 first: reset to 0; middle: reset to 1; last: increment to 4
	+ * 0 1 4 first: reset to 0; middle: increment to 2
	+ * 0 2 4 first: increment to 1
	+ * 1 2 4 first: reset to 0; middle: increment to 3
	+ * 0 3 4 first: increment to 1
	+ * 1 3 4 first: increment to 2
	+ * 2 3 4 first: reset to 0; middle: reset to 1; last: increment to 5
	+ * 0 1 5 first: reset to 0; middle: increment to 2
	+ * 0 2 5 first: increment to 1
	+ * 1 2 5 first: reset to 0; middle: increment to 3
	+ * 0 3 5 first: increment to 1
	+ * 1 3 5 first: increment to 2
	+ * 2 3 5 first: reset to 0; middle: increment to 4
	+ * 0 4 5 first: increment to 1
	+ * 1 4 5 first: increment to 2
	+ * 2 4 5 first: increment to 3
	+ * 3 4 5 done
	*/
	-static int
	-vdev_raidz_combrec(zio_t *zio, int total_errors, int data_errors)
	-{
	- raidz_map_t *rm = zio->io_vsd;
	- raidz_col_t *rc;
	- void *orig[VDEV_RAIDZ_MAXPARITY];
	- int tstore[VDEV_RAIDZ_MAXPARITY + 2];
	- int *tgts = &tstore[1];
	- int current, next, i, c, n;
	- int code, ret = 0;

	- ASSERT(total_errors < rm->rm_firstdatacol);
	+/*
	+ * Should this sector be considered failed for logical child ID i?
	+ * XXX comment explaining logical child IDs
	+ */
	+static boolean_t
	+raidz_simulate_failure(vdev_raidz_t *vdrz, int ashift, int i, raidz_col_t *rc)
	+{
	+ uint64_t sector_id =
	+ vdrz->vd_physical_width * (rc->rc_offset >> ashift) +
	+ rc->rc_devidx;
	+
	+#if 0
	+ zfs_dbgmsg("raidz_simulate_failure(pw=%u lw=%u ashift=%u i=%u rc_offset=%llx rc_devidx=%u sector_id=%u",
	+ vdrz->vd_physical_width,
	+ vdrz->vd_logical_width,
	+ ashift,
	+ i,
	+ (long long)rc->rc_offset,
	+ (int)rc->rc_devidx,
	+ (long long)sector_id);
	+#endif

	- /*
	- * This simplifies one edge condition.
	- */
	- tgts[-1] = -1;
	+ for (int w = vdrz->vd_physical_width;
	+ w >= vdrz->vd_logical_width; w--) {
	+ if (i < w) {
	+ return (sector_id % w == i);
	+ } else {
	+ i -= w;
	+ }
	+ }
	+ ASSERT(!"invalid logical child id");
	+ return (B_FALSE);
	+}

	- for (n = 1; n <= rm->rm_firstdatacol - total_errors; n++) {
	- /*
	- * Initialize the targets array by finding the first n columns
	- * that contain no error.
	- *
	- * If there were no data errors, we need to ensure that we're
	- * always explicitly attempting to reconstruct at least one
	- * data column. To do this, we simply push the highest target
	- * up into the data columns.
	- */
	- for (c = 0, i = 0; i < n; i++) {
	- if (i == n - 1 && data_errors == 0 &&
	- c < rm->rm_firstdatacol) {
	- c = rm->rm_firstdatacol;
	+static void
	+raidz_restore_orig_data(raidz_map_t *rm)
	+{
	+ for (int i = 0; i < rm->rm_nrows; i++) {
	+ raidz_row_t *rr = rm->rm_row[i];
	+ for (int c = 0; c < rr->rr_cols; c++) {
	+ raidz_col_t *rc = &rr->rr_col[c];
	+ if (rc->rc_need_orig_restore) {
	+ abd_copy_from_buf(rc->rc_abd,
	+ rc->rc_orig_data, rc->rc_size);
	+ rc->rc_need_orig_restore = B_FALSE;
	}
	+ }
	+ }
	+}

	- while (rm->rm_col[c].rc_error != 0) {
	- c++;
	- ASSERT3S(c, <, rm->rm_cols);
	+/*
	+ * returns EINVAL if reconstruction of the block will not be possible
	+ * returns ECKSUM if this specific reconstruction failed
	+ * returns 0 on successful reconstruction
	+ */
	+static int
	+raidz_reconstruct(zio_t *zio, int *ltgts, int ntgts)
	+{
	+ raidz_map_t *rm = zio->io_vsd;
	+ vdev_raidz_t *vdrz = zio->io_vd->vdev_tsd;
	+
	+ zfs_dbgmsg("raidz_reconstruct_expanded(zio=%p ltgts=%u,%u,%u ntgts=%u",
	+ zio, ltgts[0], ltgts[1], ltgts[2], ntgts);
	+
	+ /* Reconstruct each row */
	+ for (int r = 0; r < rm->rm_nrows; r++) {
	+ raidz_row_t *rr = rm->rm_row[r];
	+ int my_tgts[VDEV_RAIDZ_MAXPARITY]; /* value is child id */
	+ int t = 0;
	+ int dead = 0;
	+ int dead_data = 0;
	+
	+ zfs_dbgmsg("raidz_reconstruct_expanded(row=%u)",
	+ r);
	+
	+ for (int c = 0; c < rr->rr_cols; c++) {
	+ raidz_col_t *rc = &rr->rr_col[c];
	+ ASSERT0(rc->rc_need_orig_restore);
	+ if (rc->rc_error != 0) {
	+ dead++;
	+ if (c >= vdrz->vd_nparity)
	+ dead_data++;
	+ continue;
	+ }
	+ if (rc->rc_size == 0)
	+ continue;
	+ for (int lt = 0; lt < ntgts; lt++) {
	+ if (raidz_simulate_failure(vdrz,
	+ zio->io_vd->vdev_top->vdev_ashift,
	+ ltgts[lt], rc)) {
	+ if (rc->rc_orig_data == NULL) {
	+ rc->rc_orig_data =
	+ zio_buf_alloc(rc->rc_size);
	+ abd_copy_to_buf(rc->rc_orig_data,
	+ rc->rc_abd, rc->rc_size);
	+ }
	+ rc->rc_need_orig_restore = B_TRUE;
	+
	+ dead++;
	+ if (c >= vdrz->vd_nparity)
	+ dead_data++;
	+ my_tgts[t++] = c;
	+ zfs_dbgmsg("simulating failure of col %u devidx %u",
	+ c, (int)rc->rc_devidx);
	+ break;
	+ }
	}
	-
	- tgts[i] = c++;
	}
	-
	- /*
	- * Setting tgts[n] simplifies the other edge condition.
	- */
	- tgts[n] = rm->rm_cols;
	-
	- /*
	- * These buffers were allocated in previous iterations.
	- */
	- for (i = 0; i < n - 1; i++) {
	- ASSERT(orig[i] != NULL);
	+ if (dead > vdrz->vd_nparity) {
	+ /* reconstruction not possible */
	+ zfs_dbgmsg("reconstruction not possible; too many failures");
	+ raidz_restore_orig_data(rm);
	+ return (EINVAL);
	}
	+ rr->rr_code = 0;
	+ if (dead_data > 0)
	+ rr->rr_code = vdev_raidz_reconstruct_row(rr, my_tgts, t);
	+ }

	- orig[n - 1] = zio_buf_alloc(rm->rm_col[0].rc_size);
	+ /* Check for success */
	+ if (raidz_checksum_verify(zio) == 0) {
	+
	+ /* Reconstruction succeeded - report errors */
	+ for (int i = 0; i < rm->rm_nrows; i++) {
	+ raidz_row_t *rr = rm->rm_row[i];
	+ atomic_inc_64(&raidz_corrected[rr->rr_code]);
	+
	+ for (int c = 0; c < rr->rr_cols; c++) {
	+ raidz_col_t *rc = &rr->rr_col[c];
	+ if (rc->rc_need_orig_restore) {
	+ /*
	+ * Note: if this is a parity column,
	+ * we don't really know if it's wrong.
	+ * We need to let
	+ * vdev_raidz_io_done_verified() check
	+ * it, and if we set rc_error, it will
	+ * think that it is a "known" error
	+ * that doesn't need to be checked
	+ * or corrected.
	+ */
	+ if (rc->rc_error == 0 && c >= rr->rr_firstdatacol) {
	+ raidz_checksum_error(zio, rc, rc->rc_gdata);
	+ rc->rc_error = SET_ERROR(ECKSUM);
	+ }
	+ rc->rc_need_orig_restore = B_FALSE;
	+ }
	+ }

	- current = 0;
	- next = tgts[current];
	+ vdev_raidz_io_done_verified(zio, rr);
	+ }

	- while (current != n) {
	- tgts[current] = next;
	- current = 0;
	+ zio_checksum_verified(zio);

	- /*
	- * Save off the original data that we're going to
	- * attempt to reconstruct.
	- */
	- for (i = 0; i < n; i++) {
	- ASSERT(orig[i] != NULL);
	- c = tgts[i];
	- ASSERT3S(c, >=, 0);
	- ASSERT3S(c, <, rm->rm_cols);
	- rc = &rm->rm_col[c];
	- abd_copy_to_buf(orig[i], rc->rc_abd,
	- rc->rc_size);
	- }
	+ zfs_dbgmsg("reconstruction successful (checksum verified)");
	+ return (0);
	+ }

	- /*
	- * Attempt a reconstruction and exit the outer loop on
	- * success.
	- */
	- code = vdev_raidz_reconstruct(rm, tgts, n);
	- if (raidz_checksum_verify(zio) == 0) {
	- atomic_inc_64(&raidz_corrected[code]);
	-
	- for (i = 0; i < n; i++) {
	- c = tgts[i];
	- rc = &rm->rm_col[c];
	- ASSERT(rc->rc_error == 0);
	- if (rc->rc_tried)
	- raidz_checksum_error(zio, rc,
	- orig[i]);
	- rc->rc_error = SET_ERROR(ECKSUM);
	- }
	+ /* Reconstruction failed - restore original data */
	+ raidz_restore_orig_data(rm);
	+ zfs_dbgmsg("raidz_reconstruct_expanded(zio=%p) checksum failed",
	+ zio);
	+ return (ECKSUM);
	+}

	- ret = code;
	- goto done;
	- }
	+/*
	+ * return 0 on success, ECKSUM on failure
	+ */
	+static int
	+vdev_raidz_combrec(zio_t *zio)
	+{
	+ raidz_map_t *rm = zio->io_vsd;
	+ vdev_raidz_t *vdrz = zio->io_vd->vdev_tsd;
	+
	+ for (int num_failures = 1; num_failures <= vdrz->vd_nparity;
	+ num_failures++) {
	+ int tstore[VDEV_RAIDZ_MAXPARITY + 2];
	+ int *ltgts = &tstore[1]; /* value is logical child ID */
	+
	+ /* Determine number of logical children, n */
	+ int n = 0;
	+ for (int w = vdrz->vd_physical_width;
	+ w >= vdrz->vd_logical_width; w--) {
	+ n += w;
	+ }

	- /*
	- * Restore the original data.
	- */
	- for (i = 0; i < n; i++) {
	- c = tgts[i];
	- rc = &rm->rm_col[c];
	- abd_copy_from_buf(rc->rc_abd, orig[i],
	- rc->rc_size);
	- }
	+ ASSERT3U(num_failures, <=, vdrz->vd_nparity);
	+ ASSERT3U(num_failures, <=, VDEV_RAIDZ_MAXPARITY);
	+ /* handle corner cases in combrec logic */
	+ ltgts[-1] = -1;
	+ for (int i = 0; i < num_failures; i++) {
	+ ltgts[i] = i;
	+ }
	+ ltgts[num_failures] = n;

	- do {
	+ for (;;) {
	+ int err = raidz_reconstruct(zio,
	+ ltgts, num_failures);
	+ if (err == EINVAL) {
	/*
	- * Find the next valid column after the current
	- * position..
	+ * Reconstruction not possible with this #
	+ * failures; try more failures.
	*/
	- for (next = tgts[current] + 1;
	- next < rm->rm_cols &&
	- rm->rm_col[next].rc_error != 0; next++)
	- continue;
	+ break;
	+ } else if (err == 0)
	+ return (0);
	+
	+ /* Compute next targets to try */
	+ for (int t = 0; ; t++) {
	+ ASSERT3U(t, <, num_failures);
	+ ltgts[t]++;
	+ if (ltgts[t] == n) {
	+ ASSERT3U(t, ==, num_failures - 1);
	+ zfs_dbgmsg("reconstruction failed for num_failures=%u; tried all combinations",
	+ num_failures);
	+ break; // try more failures
	+ }

	- ASSERT(next <= tgts[current + 1]);
	+ ASSERT3U(ltgts[t], <, n);
	+ ASSERT3U(ltgts[t], <=, ltgts[t + 1]);

	/*
	* If that spot is available, we're done here.
	*/
	- if (next != tgts[current + 1])
	- break;
	+ if (ltgts[t] != ltgts[t + 1])
	+ break; // found next combination

	/*
	- * Otherwise, find the next valid column after
	- * the previous position.
	+ * Otherwise, reset this tgt to the minimum,
	+ * and move on to the next tgt.
	*/
	- for (c = tgts[current - 1] + 1;
	- rm->rm_col[c].rc_error != 0; c++)
	- continue;
	-
	- tgts[current] = c;
	- current++;
	-
	- } while (current != n);
	+ ltgts[t] = ltgts[t - 1] + 1;
	+ ASSERT3U(ltgts[t], ==, t);
	+ }
	+ if (ltgts[num_failures - 1] == n)
	+ break; // try more failures
	}
	}
	- n--;
	-done:
	- for (i = 0; i < n; i++) {
	- zio_buf_free(orig[i], rm->rm_col[0].rc_size);
	- }
	-
	- return (ret);
	+ zfs_dbgmsg("reconstruction failed for all num_failures");
	+ return (ECKSUM);
	}

	/*
	- * Complete an IO operation on a RAIDZ VDev
	+ * Complete a write IO operation on a RAIDZ VDev
	*
	* Outline:
	- * - For write operations:
	* 1. Check for errors on the child IOs.
	* 2. Return, setting an error code if too few child VDevs were written
	* to reconstruct the data later. Note that partial writes are
	* considered successful if they can be reconstructed at all.
	- * - For read operations:
	- * 1. Check for errors on the child IOs.
	- * 2. If data errors occurred:
	- * a. Try to reassemble the data from the parity available.
	- * b. If we haven't yet read the parity drives, read them now.
	- * c. If all parity drives have been read but the data still doesn't
	- * reassemble with a correct checksum, then try combinatorial
	- * reconstruction.
	- * d. If that doesn't work, return an error.
	- * 3. If there were unexpected errors or this is a resilver operation,
	- * rewrite the vdevs that had errors.
	*/
	static void
	-vdev_raidz_io_done(zio_t *zio)
	+vdev_raidz_io_done_write_impl(zio_t *zio, raidz_row_t *rr)
	+{
	+ int total_errors = 0;
	+
	+ ASSERT3U(rr->rr_missingparity, <=, rr->rr_firstdatacol);
	+ ASSERT3U(rr->rr_missingdata, <=, rr->rr_cols - rr->rr_firstdatacol);
	+ ASSERT3U(zio->io_type, ==, ZIO_TYPE_WRITE);
	+
	+ for (int c = 0; c < rr->rr_cols; c++) {
	+ raidz_col_t *rc = &rr->rr_col[c];
	+
	+ if (rc->rc_error) {
	+ ASSERT(rc->rc_error != ECKSUM); /* child has no bp */
	+
	+ total_errors++;
	+ }
	+ }
	+
	+ /*
	+ * XXX -- for now, treat partial writes as a success.
	+ * (If we couldn't write enough columns to reconstruct
	+ * the data, the I/O failed. Otherwise, good enough.)
	+ *
	+ * Now that we support write reallocation, it would be better
	+ * to treat partial failure as real failure unless there are
	+ * no non-degraded top-level vdevs left, and not update DTLs
	+ * if we intend to reallocate.
	+ */
	+ /* XXPOLICY */
	+ if (total_errors > rr->rr_firstdatacol) {
	+ zio->io_error = zio_worst_error(zio->io_error,
	+ vdev_raidz_worst_error(rr));
	+ }
	+}
	+
	+/*
	+ * return 0 if no reconstruction occurred, otherwise the "code" from
	+ * vdev_raidz_reconstruct().
	+ */
	+static int
	+vdev_raidz_io_done_reconstruct_known_missing(zio_t *zio, raidz_row_t *rr)
	{
	- vdev_t *vd = zio->io_vd;
	- vdev_t *cvd;
	- raidz_map_t *rm = zio->io_vsd;
	- raidz_col_t *rc;
	- int unexpected_errors = 0;
	int parity_errors = 0;
	int parity_untried = 0;
	int data_errors = 0;
	int total_errors = 0;
	- int n, c;
	- int tgts[VDEV_RAIDZ_MAXPARITY];
	- int code;
	-
	- ASSERT(zio->io_bp != NULL); /* XXX need to add code to enforce this */
	+ int code = 0;

	- ASSERT(rm->rm_missingparity <= rm->rm_firstdatacol);
	- ASSERT(rm->rm_missingdata <= rm->rm_cols - rm->rm_firstdatacol);
	+ ASSERT3U(rr->rr_missingparity, <=, rr->rr_firstdatacol);
	+ ASSERT3U(rr->rr_missingdata, <=, rr->rr_cols - rr->rr_firstdatacol);
	+ ASSERT3U(zio->io_type, ==, ZIO_TYPE_READ);

	- for (c = 0; c < rm->rm_cols; c++) {
	- rc = &rm->rm_col[c];
	+ for (int c = 0; c < rr->rr_cols; c++) {
	+ raidz_col_t *rc = &rr->rr_col[c];

	if (rc->rc_error) {
	ASSERT(rc->rc_error != ECKSUM); /* child has no bp */

	- if (c < rm->rm_firstdatacol)
	+ if (c < rr->rr_firstdatacol)
	parity_errors++;
	else
	data_errors++;

	- if (!rc->rc_skipped)
	- unexpected_errors++;
	-
	total_errors++;
	- } else if (c < rm->rm_firstdatacol && !rc->rc_tried) {
	+ } else if (c < rr->rr_firstdatacol && !rc->rc_tried) {
	parity_untried++;
	}
	}

	- if (zio->io_type == ZIO_TYPE_WRITE) {
	+ /*
	+ * If there were data errors and the number of errors we saw was
	+ * correctable -- less than or equal to the number of parity disks read
	+ * -- reconstruct based on the missing data.
	+ */
	+ if (data_errors != 0 &&
	+ total_errors <= rr->rr_firstdatacol - parity_untried) {
	/*
	- * XXX -- for now, treat partial writes as a success.
	- * (If we couldn't write enough columns to reconstruct
	- * the data, the I/O failed. Otherwise, good enough.)
	- *
	- * Now that we support write reallocation, it would be better
	- * to treat partial failure as real failure unless there are
	- * no non-degraded top-level vdevs left, and not update DTLs
	- * if we intend to reallocate.
	+ * We either attempt to read all the parity columns or
	+ * none of them. If we didn't try to read parity, we
	+ * wouldn't be here in the correctable case. There must
	+ * also have been fewer parity errors than parity
	+ * columns or, again, we wouldn't be in this code path.
	*/
	- /* XXPOLICY */
	- if (total_errors > rm->rm_firstdatacol)
	- zio->io_error = vdev_raidz_worst_error(rm);
	+ ASSERT(parity_untried == 0);
	+ ASSERT(parity_errors < rr->rr_firstdatacol);

	- return;
	- } else if (zio->io_type == ZIO_TYPE_FREE) {
	- return;
	+ /*
	+ * Identify the data columns that reported an error.
	+ */
	+ int n = 0;
	+ int tgts[VDEV_RAIDZ_MAXPARITY];
	+ for (int c = rr->rr_firstdatacol; c < rr->rr_cols; c++) {
	+ raidz_col_t *rc = &rr->rr_col[c];
	+ if (rc->rc_error != 0) {
	+ ASSERT(n < VDEV_RAIDZ_MAXPARITY);
	+ tgts[n++] = c;
	+ }
	+ }
	+
	+ ASSERT(rr->rr_firstdatacol >= n);
	+
	+ code = vdev_raidz_reconstruct_row(rr, tgts, n);
	}

	- ASSERT(zio->io_type == ZIO_TYPE_READ);
	- /*
	- * There are three potential phases for a read:
	- * 1. produce valid data from the columns read
	- * 2. read all disks and try again
	- * 3. perform combinatorial reconstruction
	- *
	- * Each phase is progressively both more expensive and less likely to
	- * occur. If we encounter more errors than we can repair or all phases
	- * fail, we have no choice but to return an error.
	- */
	+ return (code);
	+}

	- /*
	- * If the number of errors we saw was correctable -- less than or equal
	- * to the number of parity disks read -- attempt to produce data that
	- * has a valid checksum. Naturally, this case applies in the absence of
	- * any errors.
	- */
	- if (total_errors <= rm->rm_firstdatacol - parity_untried) {
	- if (data_errors == 0) {
	- if (raidz_checksum_verify(zio) == 0) {
	- /*
	- * If we read parity information (unnecessarily
	- * as it happens since no reconstruction was
	- * needed) regenerate and verify the parity.
	- * We also regenerate parity when resilvering
	- * so we can write it out to the failed device
	- * later.
	- */
	- if (parity_errors + parity_untried <
	- rm->rm_firstdatacol ||
	- (zio->io_flags & ZIO_FLAG_RESILVER)) {
	- n = raidz_parity_verify(zio, rm);
	- unexpected_errors += n;
	- ASSERT(parity_errors + n <=
	- rm->rm_firstdatacol);
	- }
	- goto done;
	+/*
	+ * return the number of reads issued.
	+ */
	+static int
	+vdev_raidz_read_all(zio_t *zio, raidz_row_t *rr)
	+{
	+ vdev_t *vd = zio->io_vd;
	+ int nread = 0;
	+
	+ rr->rr_missingdata = 0;
	+ rr->rr_missingparity = 0;
	+
	+ for (int c = 0; c < rr->rr_cols; c++) {
	+ raidz_col_t *rc = &rr->rr_col[c];
	+ if (rc->rc_tried || rc->rc_size == 0)
	+ continue;
	+
	+ zio_nowait(zio_vdev_child_io(zio, NULL,
	+ vd->vdev_child[rc->rc_devidx],
	+ rc->rc_offset, rc->rc_abd, rc->rc_size,
	+ zio->io_type, zio->io_priority, 0,
	+ vdev_raidz_child_done, rc));
	+ nread++;
	+ }
	+ return (nread);
	+}
	+
	+static void
	+vdev_raidz_io_done(zio_t *zio)
	+{
	+ raidz_map_t *rm = zio->io_vsd;
	+ vdev_raidz_t *vdrz = zio->io_vd->vdev_tsd;
	+
	+ ASSERT(zio->io_bp != NULL); /* XXX need to add code to enforce this */
	+ if (zio->io_type == ZIO_TYPE_WRITE) {
	+ for (int i = 0; i < rm->rm_nrows; i++) {
	+ vdev_raidz_io_done_write_impl(zio, rm->rm_row[i]);
	+ }
	+ } else if (zio->io_type == ZIO_TYPE_FREE) {
	+ return;
	+ } else {
	+ for (int i = 0; i < rm->rm_nrows; i++) {
	+ raidz_row_t *rr = rm->rm_row[i];
	+ rr->rr_code =
	+ vdev_raidz_io_done_reconstruct_known_missing(zio,
	+ rr);
	+ }
	+
	+ if (raidz_checksum_verify(zio) == 0) {
	+ for (int i = 0; i < rm->rm_nrows; i++) {
	+ raidz_row_t *rr = rm->rm_row[i];
	+ atomic_inc_64(&raidz_corrected[rr->rr_code]);
	+ vdev_raidz_io_done_verified(zio, rr);
	}
	+ zio_checksum_verified(zio);
	} else {
	/*
	- * We either attempt to read all the parity columns or
	- * none of them. If we didn't try to read parity, we
	- * wouldn't be here in the correctable case. There must
	- * also have been fewer parity errors than parity
	- * columns or, again, we wouldn't be in this code path.
	+ * This isn't a typical situation -- either we got a
	+ * read error or a child silently returned bad data.
	+ * Read every block so we can try again with as much
	+ * data and parity as we can track down. If we've
	+ * already been through once before, all children will
	+ * be marked as tried so we'll proceed to combinatorial
	+ * reconstruction.
	*/
	- ASSERT(parity_untried == 0);
	- ASSERT(parity_errors < rm->rm_firstdatacol);
	-
	+ int nread = 0;
	+ for (int i = 0; i < rm->rm_nrows; i++) {
	+ nread += vdev_raidz_read_all(zio,
	+ rm->rm_row[i]);
	+ }
	+ if (nread != 0) {
	+ /*
	+ * Normally our stage is VDEV_IO_DONE, but if
	+ * we've already called redone(), it will have
	+ * changed to VDEV_IO_START, in which case we
	+ * don't want to call redone() again.
	+ */
	+ if (zio->io_stage != ZIO_STAGE_VDEV_IO_START)
	+ zio_vdev_io_redone(zio);
	+ return;
	+ }
	/*
	- * Identify the data columns that reported an error.
	+ * It would be too expensive to try every possible
	+ * combination of failed sectors in every row, so
	+ * instead we try every combination of failed current or
	+ * past physical disk. This means that if the incorrect
	+ * sectors were all on Nparity disks at any point in the
	+ * past, we will find the correct data. I think that
	+ * the only case where this is less durable than
	+ * a non-expanded RAIDZ, is if we have a silent
	+ * failure during expansion. In that case, one block
	+ * could be partially in the old format and partially
	+ * in the new format, so we'd lose some sectors
	+ * from the old format and some from the new format.
	+ *
	+ * e.g. logical_width=4 physical_width=6
	+ * the 15 (6+5+4) possible failed disks are:
	+ * width=6 child=0
	+ * width=6 child=1
	+ * width=6 child=2
	+ * width=6 child=3
	+ * width=6 child=4
	+ * width=6 child=5
	+ * width=5 child=0
	+ * width=5 child=1
	+ * width=5 child=2
	+ * width=5 child=3
	+ * width=5 child=4
	+ * width=4 child=0
	+ * width=4 child=1
	+ * width=4 child=2
	+ * width=4 child=3
	+ * And we will try every combination of Nparity of these
	+ * failing.
	+ *
	+ * As a first pass, we can generate every combo,
	+ * and try reconstructing, ignoring any known
	+ * failures. If any row has too many known + simulated
	+ * failures, then we bail on reconstructing with this
	+ * number of simulated failures. As an improvement,
	+ * we could detect the number of whole known failures
	+ * (i.e. we have known failures on these disks for
	+ * every row; the disks never succeeded), and
	+ * subtract that from the max # failures to simulate.
	+ * We could go even further like the current
	+ * combrec code, but that doesn't seem like it
	+ * gains us very much. If we simulate a failure
	+ * that is also a known failure, that's fine.
	*/
	- n = 0;
	- for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) {
	- rc = &rm->rm_col[c];
	- if (rc->rc_error != 0) {
	- ASSERT(n < VDEV_RAIDZ_MAXPARITY);
	- tgts[n++] = c;
	- }
	- }
	-
	- ASSERT(rm->rm_firstdatacol >= n);
	-
	- code = vdev_raidz_reconstruct(rm, tgts, n);
	-
	- if (raidz_checksum_verify(zio) == 0) {
	- atomic_inc_64(&raidz_corrected[code]);
	-
	+ if (vdev_raidz_combrec(zio) != 0) {
	/*
	- * If we read more parity disks than were used
	- * for reconstruction, confirm that the other
	- * parity disks produced correct data. This
	- * routine is suboptimal in that it regenerates
	- * the parity that we already used in addition
	- * to the parity that we're attempting to
	- * verify, but this should be a relatively
	- * uncommon case, and can be optimized if it
	- * becomes a problem. Note that we regenerate
	- * parity when resilvering so we can write it
	- * out to failed devices later.
	+ * We're here because either:
	+ *
	+ * total_errors == rm_first_datacol, or
	+ * vdev_raidz_combrec() failed
	+ *
	+ * In either case, there is enough bad data to prevent
	+ * reconstruction.
	+ *
	+ * Start checksum ereports for all children which haven't
	+ * failed, and the IO wasn't speculative.
	*/
	- if (parity_errors < rm->rm_firstdatacol - n ||
	- (zio->io_flags & ZIO_FLAG_RESILVER)) {
	- n = raidz_parity_verify(zio, rm);
	- unexpected_errors += n;
	- ASSERT(parity_errors + n <=
	- rm->rm_firstdatacol);
	+ zio->io_error = SET_ERROR(ECKSUM);
	+
	+ if (!(zio->io_flags & ZIO_FLAG_SPECULATIVE)) {
	+ for (int i = 0; i < rm->rm_nrows; i++) {
	+ raidz_row_t *rr = rm->rm_row[i];
	+ for (int c = 0; c < rr->rr_cols; c++) {
	+ raidz_col_t *rc = &rr->rr_col[c];
	+ if (rc->rc_error == 0) {
	+ zio_bad_cksum_t zbc;
	+ zbc.zbc_has_cksum = 0;
	+ zbc.zbc_injected =
	+ rm->rm_ecksuminjected;
	+
	+ zfs_ereport_start_checksum(
	+ zio->io_spa,
	+ zio->io_vd->vdev_child[rc->rc_devidx],
	+ zio, rc->rc_offset, rc->rc_size,
	+ (void *)(uintptr_t)c, &zbc);
	+ }
	+ }
	+ }
	}
	-
	- goto done;
	}
	}
	}
	+ ASSERT(!vdrz->vn_expanding);
	+}
	+
	+static void
	+vdev_raidz_state_change(vdev_t *vd, int faulted, int degraded)
	+{
	+ vdev_raidz_t *vdrz = vd->vdev_tsd;
	+ if (faulted > vdrz->vd_nparity)
	+ vdev_set_state(vd, B_FALSE, VDEV_STATE_CANT_OPEN,
	+ VDEV_AUX_NO_REPLICAS);
	+ else if (degraded + faulted != 0)
	+ vdev_set_state(vd, B_FALSE, VDEV_STATE_DEGRADED, VDEV_AUX_NONE);
	+ else
	+ vdev_set_state(vd, B_FALSE, VDEV_STATE_HEALTHY, VDEV_AUX_NONE);
	+}
	+
	+static void
	+raidz_copy_range(void *arg, uint64_t start, uint64_t size)
	+{
	+ vdev_t *vd = arg;
	+ int ashift = vd->vdev_top->vdev_ashift;
	+ int old_children = vd->vdev_children - 1;
	+ spa_t *spa = vd->vdev_spa;
	+
	+ ASSERT(IS_P2ALIGNED(start, 1 << ashift));
	+ ASSERT(IS_P2ALIGNED(size, 1 << ashift));
	+
	+ abd_t *abd = abd_alloc_for_io(1 << ashift, B_FALSE);
	+ for (uint64_t i = MAX(start >> ashift, old_children);
	+ i < (start + size) >> ashift; i++) {
	+ int child = i % old_children;
	+ int offset = (i / old_children) << ashift;
	+ spa_config_enter(spa, SCL_STATE, spa, RW_READER);
	+ VERIFY0(zio_wait(zio_read_phys(NULL,
	+ vd->vdev_child[child],
	+ offset + VDEV_LABEL_START_SIZE,
	+ 1 << ashift, abd,
	+ ZIO_CHECKSUM_OFF, NULL, NULL,
	+ ZIO_PRIORITY_REMOVAL, 0, B_FALSE)));
	+
	+ child = i % vd->vdev_children;
	+ offset = (i / vd->vdev_children) << ashift;
	+ VERIFY0(zio_wait(zio_write_phys(NULL,
	+ vd->vdev_child[child],
	+ offset + VDEV_LABEL_START_SIZE,
	+ 1 << ashift, abd,
	+ ZIO_CHECKSUM_OFF, NULL, NULL,
	+ ZIO_PRIORITY_REMOVAL, 0, B_FALSE)));
	+ spa_config_exit(spa, SCL_STATE, spa);
	+ }
	+ abd_free(abd);
	+}
	+
	+void
	+vdev_raidz_attach_sync(void *arg, dmu_tx_t *tx)
	+{
	+ vdev_t *new_child = arg;
	+ spa_t *spa = new_child->vdev_spa;
	+ vdev_t *raidvd = new_child->vdev_parent;
	+ vdev_raidz_t *vdrz = raidvd->vdev_tsd;
	+ ASSERT3P(raidvd->vdev_ops, ==, &vdev_raidz_ops);
	+ ASSERT3P(raidvd->vdev_top, ==, raidvd);
	+ ASSERT3U(raidvd->vdev_children, >, vdrz->vd_logical_width);
	+ ASSERT3U(raidvd->vdev_children, ==, vdrz->vd_physical_width + 1);

	/*
	- * This isn't a typical situation -- either we got a read error or
	- * a child silently returned bad data. Read every block so we can
	- * try again with as much data and parity as we can track down. If
	- * we've already been through once before, all children will be marked
	- * as tried so we'll proceed to combinatorial reconstruction.
	+ * XXX assuming that no other i/o takes place while this is happening,
	+ * until we increment physical_width. But ZIL could do i/o.
	*/
	- unexpected_errors = 1;
	- rm->rm_missingdata = 0;
	- rm->rm_missingparity = 0;
	+ vdrz->vn_expanding = B_TRUE;

	- for (c = 0; c < rm->rm_cols; c++) {
	- if (rm->rm_col[c].rc_tried)
	- continue;
	+ /*spa_config_enter(spa, SCL_CONFIG, FTAG, RW_READER);*/

	- zio_vdev_io_redone(zio);
	- do {
	- rc = &rm->rm_col[c];
	- if (rc->rc_tried)
	- continue;
	- zio_nowait(zio_vdev_child_io(zio, NULL,
	- vd->vdev_child[rc->rc_devidx],
	- rc->rc_offset, rc->rc_abd, rc->rc_size,
	- zio->io_type, zio->io_priority, 0,
	- vdev_raidz_child_done, rc));
	- } while (++c < rm->rm_cols);
	+ range_tree_t *rt = range_tree_create(NULL, NULL);

	- return;
	+ for (uint64_t i = 0; i < raidvd->vdev_ms_count; i++) {
	+ metaslab_t *msp = raidvd->vdev_ms[i];
	+
	+ /*vdev_initialize_ms_mark(msp);*/
	+ mutex_enter(&msp->ms_lock);
	+
	+ metaslab_load_wait(msp);
	+ if (!msp->ms_loaded)
	+ VERIFY0(metaslab_load(msp));
	+
	+ /*
	+ * We want to copy everything except the free (allocatable)
	+ * space. Note that there may be a little bit more free
	+ * space (e.g. in ms_defer), and it's fine to copy that too.
	+ */
	+ ASSERT(range_tree_is_empty(rt));
	+ range_tree_add(rt, msp->ms_start, msp->ms_size);
	+ range_tree_walk(msp->ms_allocatable, range_tree_remove, rt);
	+ mutex_exit(&msp->ms_lock);
	+
	+ /*spa_config_exit(spa, SCL_CONFIG, FTAG);*/
	+ /* Note, _vacate() doesn't visit in order */
	+ range_tree_walk(rt, raidz_copy_range, raidvd);
	+ range_tree_vacate(rt, NULL, NULL);
	+ /*vdev_initialize_ms_unmark(msp);*/
	+ /*spa_config_enter(spa, SCL_CONFIG, FTAG, RW_READER);*/
	}

	+ /*spa_config_exit(spa, SCL_CONFIG, FTAG);*/
	+ range_tree_destroy(rt);
	+
	+ vdrz->vd_physical_width++;
	+
	+#if 0
	+ raidvd->vdev_expanding = B_TRUE;
	+ vdev_reopen(raidvd);
	+ raidvd->vdev_expanding = B_FALSE;
	+#endif
	+
	+ vdrz->vn_expanding = B_FALSE;
	+ /* Ensure that widths get written to label config */
	+ vdev_config_dirty(raidvd);
	+}
	+
	+/*
	+ * Add RAIDZ-specific fields to the config nvlist.
	+ * XXX add this to vdev_ops_t?
	+ */
	+void
	+vdev_raidz_config_generate(vdev_t *vd, nvlist_t *nv)
	+{
	+ spa_t *spa = vd->vdev_spa;
	+ ASSERT3P(vd->vdev_ops, ==, &vdev_raidz_ops);
	+ vdev_raidz_t *vdrz = vd->vdev_tsd;
	+
	+ /*
	+ * Make sure someone hasn't managed to sneak a fancy new vdev
	+ * into a crufty old storage pool.
	+ */
	+ ASSERT(vdrz->vd_nparity == 1 ||
	+ (vdrz->vd_nparity <= 2 &&
	+ spa_version(spa) >= SPA_VERSION_RAIDZ2) ||
	+ (vdrz->vd_nparity <= 3 &&
	+ spa_version(spa) >= SPA_VERSION_RAIDZ3));
	+
	/*
	- * At this point we've attempted to reconstruct the data given the
	- * errors we detected, and we've attempted to read all columns. There
	- * must, therefore, be one or more additional problems -- silent errors
	- * resulting in invalid data rather than explicit I/O errors resulting
	- * in absent data. We check if there is enough additional data to
	- * possibly reconstruct the data and then perform combinatorial
	- * reconstruction over all possible combinations. If that fails,
	- * we're cooked.
	+ * Note that we'll add these even on storage pools where they
	+ * aren't strictly required -- older software will just ignore
	+ * it.
	*/
	- if (total_errors > rm->rm_firstdatacol) {
	- zio->io_error = vdev_raidz_worst_error(rm);
	+ fnvlist_add_uint64(nv, ZPOOL_CONFIG_NPARITY, vdrz->vd_nparity);
	+ fnvlist_add_uint64(nv, ZPOOL_CONFIG_RAIDZ_LOGICAL_WIDTH,
	+ vdrz->vd_logical_width);
	+}
	+
	+/*
	+ * Set RAIDZ-specific fields in the vdev_t, based on the config.
	+ * Can't assume that anything about the vdev_t is already set.
	+ * XXX add this to vdev_ops_t?
	+ */
	+void *
	+vdev_raidz_get_tsd(spa_t *spa, nvlist_t *nv)
	+{
	+ uint64_t nparity, lw;
	+ vdev_raidz_t *vdrz = kmem_zalloc(sizeof (*vdrz), KM_SLEEP);
	+
	+ uint_t children;
	+ nvlist_t **child;
	+ int error = nvlist_lookup_nvlist_array(nv,
	+ ZPOOL_CONFIG_CHILDREN, &child, &children);
	+ if (error != 0)
	+ goto out;
	+
	+ vdrz->vd_logical_width = children;
	+ vdrz->vd_physical_width = children;
	+
	+ if (nvlist_lookup_uint64(nv, ZPOOL_CONFIG_RAIDZ_LOGICAL_WIDTH,
	+ &lw) == 0) {
	+ vdrz->vd_logical_width = lw;
	+ }

	- } else if (total_errors < rm->rm_firstdatacol &&
	- (code = vdev_raidz_combrec(zio, total_errors, data_errors)) != 0) {
	+ if (nvlist_lookup_uint64(nv, ZPOOL_CONFIG_NPARITY,
	+ &nparity) == 0) {
	+ if (nparity == 0 || nparity > VDEV_RAIDZ_MAXPARITY)
	+ goto out;
	/*
	- * If we didn't use all the available parity for the
	- * combinatorial reconstruction, verify that the remaining
	- * parity is correct.
	+ * Previous versions could only support 1 or 2 parity
	+ * device.
	*/
	- if (code != (1 << rm->rm_firstdatacol) - 1)
	- (void) raidz_parity_verify(zio, rm);
	+ if (nparity > 1 &&
	+ spa_version(spa) < SPA_VERSION_RAIDZ2)
	+ goto out;
	+ if (nparity > 2 &&
	+ spa_version(spa) < SPA_VERSION_RAIDZ3)
	+ goto out;
	} else {
	/*
	- * We're here because either:
	- *
	- * total_errors == rm_first_datacol, or
	- * vdev_raidz_combrec() failed
	- *
	- * In either case, there is enough bad data to prevent
	- * reconstruction.
	- *
	- * Start checksum ereports for all children which haven't
	- * failed, and the IO wasn't speculative.
	+ * We require the parity to be specified for SPAs that
	+ * support multiple parity levels.
	*/
	- zio->io_error = SET_ERROR(ECKSUM);
	-
	- if (!(zio->io_flags & ZIO_FLAG_SPECULATIVE)) {
	- for (c = 0; c < rm->rm_cols; c++) {
	- rc = &rm->rm_col[c];
	- if (rc->rc_error == 0) {
	- zio_bad_cksum_t zbc;
	- zbc.zbc_has_cksum = 0;
	- zbc.zbc_injected =
	- rm->rm_ecksuminjected;
	-
	- zfs_ereport_start_checksum(
	- zio->io_spa,
	- vd->vdev_child[rc->rc_devidx],
	- zio, rc->rc_offset, rc->rc_size,
	- (void *)(uintptr_t)c, &zbc);
	- }
	- }
	- }
	- }
	-
	-done:
	- zio_checksum_verified(zio);
	-
	- if (zio->io_error == 0 && spa_writeable(zio->io_spa) &&
	- (unexpected_errors || (zio->io_flags & ZIO_FLAG_RESILVER))) {
	+ if (spa_version(spa) >= SPA_VERSION_RAIDZ2)
	+ goto out;
	/*
	- * Use the good data we have in hand to repair damaged children.
	+ * Otherwise, we default to 1 parity device for RAID-Z.
	*/
	- for (c = 0; c < rm->rm_cols; c++) {
	- rc = &rm->rm_col[c];
	- cvd = vd->vdev_child[rc->rc_devidx];
	-
	- if (rc->rc_error == 0)
	- continue;
	-
	- zio_nowait(zio_vdev_child_io(zio, NULL, cvd,
	- rc->rc_offset, rc->rc_abd, rc->rc_size,
	- ZIO_TYPE_WRITE, ZIO_PRIORITY_ASYNC_WRITE,
	- ZIO_FLAG_IO_REPAIR | (unexpected_errors ?
	- ZIO_FLAG_SELF_HEAL : 0), NULL, NULL));
	- }
	+ nparity = 1;
	}
	-}
	-
	-static void
	-vdev_raidz_state_change(vdev_t *vd, int faulted, int degraded)
	-{
	- if (faulted > vd->vdev_nparity)
	- vdev_set_state(vd, B_FALSE, VDEV_STATE_CANT_OPEN,
	- VDEV_AUX_NO_REPLICAS);
	- else if (degraded + faulted != 0)
	- vdev_set_state(vd, B_FALSE, VDEV_STATE_DEGRADED, VDEV_AUX_NONE);
	- else
	- vdev_set_state(vd, B_FALSE, VDEV_STATE_HEALTHY, VDEV_AUX_NONE);
	+ vdrz->vd_nparity = nparity;
	+ return (vdrz);
	+out:
	+ kmem_free(vdrz, sizeof (*vdrz));
	+ return (NULL);
	}

	vdev_ops_t vdev_raidz_ops = {
	Index: sys/cddl/contrib/opensolaris/uts/common/sys/fs/zfs.h
	===================================================================
	--- sys/cddl/contrib/opensolaris/uts/common/sys/fs/zfs.h
	+++ sys/cddl/contrib/opensolaris/uts/common/sys/fs/zfs.h
	@@ -556,6 +556,7 @@
	#define ZPOOL_CONFIG_SPARES "spares"
	#define ZPOOL_CONFIG_IS_SPARE "is_spare"
	#define ZPOOL_CONFIG_NPARITY "nparity"
	+#define ZPOOL_CONFIG_RAIDZ_LOGICAL_WIDTH "raidz_logical_width"
	#define ZPOOL_CONFIG_HOSTID "hostid"
	#define ZPOOL_CONFIG_HOSTNAME "hostname"
	#define ZPOOL_CONFIG_LOADED_TIME "initial_load_time"

File Metadata

Mime Type: text/plain
Expires: Mon, Apr 20, 11:02 AM (2 h, 54 m)
Storage Engine: blob
Storage Format: Raw Data
Storage Handle: 31834715
Default Alt Text: D15124.diff (103 KB)

D15124.diffNo OneTemporaryActions

D15124.diffView Options

File Metadata

Event Timeline

D15124.diff
No OneTemporary
Actions

D15124.diff
View Options