Index: sys/cddl/contrib/opensolaris/uts/common/fs/zfs/abd.c =================================================================== --- sys/cddl/contrib/opensolaris/uts/common/fs/zfs/abd.c +++ sys/cddl/contrib/opensolaris/uts/common/fs/zfs/abd.c @@ -758,6 +758,9 @@ int ret = 0; struct abd_iter aiter; + if (size == 0) + return (ret); + abd_verify(abd); ASSERT3U(off + size, <=, abd->abd_size); @@ -886,6 +889,9 @@ int ret = 0; struct abd_iter daiter, saiter; + if (size == 0) + return (ret); + abd_verify(dabd); abd_verify(sabd); Index: sys/cddl/contrib/opensolaris/uts/common/fs/zfs/refcount.c =================================================================== --- sys/cddl/contrib/opensolaris/uts/common/fs/zfs/refcount.c +++ sys/cddl/contrib/opensolaris/uts/common/fs/zfs/refcount.c @@ -91,7 +91,7 @@ { reference_t *ref; - ASSERT(rc->rc_count == number); + ASSERT3U(rc->rc_count, ==, number); while (ref = list_head(&rc->rc_list)) { list_remove(&rc->rc_list, ref); kmem_cache_free(reference_cache, ref); Index: sys/cddl/contrib/opensolaris/uts/common/fs/zfs/spa.c =================================================================== --- sys/cddl/contrib/opensolaris/uts/common/fs/zfs/spa.c +++ sys/cddl/contrib/opensolaris/uts/common/fs/zfs/spa.c @@ -55,6 +55,7 @@ #include #include #include +#include #include #include #include @@ -5923,8 +5924,9 @@ vdev_t *oldvd, *newvd, *newrootvd, *pvd, *tvd; vdev_ops_t *pvops; char *oldvdpath, *newvdpath; - int newvd_isspare; + int newvd_isspare = B_FALSE; int error; + boolean_t raidz = B_FALSE; ASSERT(spa_writeable(spa)); @@ -5947,10 +5949,16 @@ if (oldvd == NULL) return (spa_vdev_exit(spa, NULL, txg, ENODEV)); - if (!oldvd->vdev_ops->vdev_op_leaf) + if (oldvd->vdev_ops == &vdev_raidz_ops) { + raidz = B_TRUE; + } else if (!oldvd->vdev_ops->vdev_op_leaf) { return (spa_vdev_exit(spa, NULL, txg, ENOTSUP)); + } - pvd = oldvd->vdev_parent; + if (raidz) + pvd = oldvd; + else + pvd = oldvd->vdev_parent; if ((error = spa_config_parse(spa, &newrootvd, nvroot, NULL, 0, VDEV_ALLOC_ATTACH)) != 0) @@ -5979,6 +5987,7 @@ * vdev. */ if (pvd->vdev_ops != &vdev_mirror_ops && + pvd->vdev_ops != &vdev_raidz_ops && pvd->vdev_ops != &vdev_root_ops) return (spa_vdev_exit(spa, newrootvd, txg, ENOTSUP)); @@ -6018,7 +6027,8 @@ /* * Make sure the new device is big enough. */ - if (newvd->vdev_asize < vdev_get_min_asize(oldvd)) + vdev_t *min_vdev = raidz ? oldvd->vdev_child[0] : oldvd; + if (newvd->vdev_asize < vdev_get_min_asize(min_vdev)) return (spa_vdev_exit(spa, newrootvd, txg, EOVERFLOW)); /* @@ -6028,35 +6038,48 @@ if (newvd->vdev_ashift > oldvd->vdev_top->vdev_ashift) return (spa_vdev_exit(spa, newrootvd, txg, EDOM)); + if (raidz) { + oldvdpath = kmem_asprintf("raidz%u-%u", + oldvd->vdev_nparity, oldvd->vdev_id); + } else { + oldvdpath = spa_strdup(oldvd->vdev_path); + } + newvdpath = spa_strdup(newvd->vdev_path); + /* * If this is an in-place replacement, update oldvd's path and devid * to make it distinguishable from newvd, and unopenable from now on. 
*/ - if (strcmp(oldvd->vdev_path, newvd->vdev_path) == 0) { + if (strcmp(oldvdpath, newvdpath) == 0) { spa_strfree(oldvd->vdev_path); - oldvd->vdev_path = kmem_alloc(strlen(newvd->vdev_path) + 5, + oldvd->vdev_path = kmem_alloc(strlen(newvdpath) + 5, KM_SLEEP); - (void) sprintf(oldvd->vdev_path, "%s/%s", - newvd->vdev_path, "old"); + (void) sprintf(oldvd->vdev_path, "%s/old", + newvdpath); if (oldvd->vdev_devid != NULL) { spa_strfree(oldvd->vdev_devid); oldvd->vdev_devid = NULL; } + spa_strfree(oldvdpath); + oldvdpath = spa_strdup(oldvd->vdev_path); } /* mark the device being resilvered */ - newvd->vdev_resilver_txg = txg; + if (!raidz) + newvd->vdev_resilver_txg = txg; /* * If the parent is not a mirror, or if we're replacing, insert the new * mirror/replacing/spare vdev above oldvd. */ - if (pvd->vdev_ops != pvops) + if (!raidz && pvd->vdev_ops != pvops) pvd = vdev_add_parent(oldvd, pvops); ASSERT(pvd->vdev_top->vdev_parent == rvd); +#if 0 ASSERT(pvd->vdev_ops == pvops); ASSERT(oldvd->vdev_parent == pvd); +#endif /* * Extract the new device from its root and add it to pvd. @@ -6079,29 +6102,34 @@ */ dtl_max_txg = txg + TXG_CONCURRENT_STATES; - vdev_dtl_dirty(newvd, DTL_MISSING, TXG_INITIAL, - dtl_max_txg - TXG_INITIAL); + if (raidz) { + dmu_tx_t *tx = dmu_tx_create_assigned(spa->spa_dsl_pool, txg); + dsl_sync_task_nowait(spa->spa_dsl_pool, vdev_raidz_attach_sync, + newvd, 0, ZFS_SPACE_CHECK_EXTRA_RESERVED, tx); + dmu_tx_commit(tx); + } else { + vdev_dtl_dirty(newvd, DTL_MISSING, TXG_INITIAL, + dtl_max_txg - TXG_INITIAL); - if (newvd->vdev_isspare) { - spa_spare_activate(newvd); - spa_event_notify(spa, newvd, NULL, ESC_ZFS_VDEV_SPARE); - } + if (newvd->vdev_isspare) { + spa_spare_activate(newvd); + spa_event_notify(spa, newvd, NULL, ESC_ZFS_VDEV_SPARE); + } - oldvdpath = spa_strdup(oldvd->vdev_path); - newvdpath = spa_strdup(newvd->vdev_path); - newvd_isspare = newvd->vdev_isspare; + newvd_isspare = newvd->vdev_isspare; - /* - * Mark newvd's DTL dirty in this txg. - */ - vdev_dirty(tvd, VDD_DTL, newvd, txg); + /* + * Mark newvd's DTL dirty in this txg. + */ + vdev_dirty(tvd, VDD_DTL, newvd, txg); - /* - * Schedule the resilver to restart in the future. We do this to - * ensure that dmu_sync-ed blocks have been stitched into the - * respective datasets. - */ - dsl_resilver_restart(spa->spa_dsl_pool, dtl_max_txg); + /* + * Schedule the resilver to restart in the future. We do this to + * ensure that dmu_sync-ed blocks have been stitched into the + * respective datasets. + */ + dsl_resilver_restart(spa->spa_dsl_pool, dtl_max_txg); + } if (spa->spa_bootfs) spa_event_notify(spa, newvd, NULL, ESC_ZFS_BOOTFS_VDEV_ATTACH); @@ -6113,6 +6141,10 @@ */ (void) spa_vdev_exit(spa, newrootvd, dtl_max_txg, 0); + if (raidz) { + error = vdev_online(spa, guid, ZFS_ONLINE_EXPAND, NULL); + } + spa_history_log_internal(spa, "vdev attach", NULL, "%s vdev=%s %s vdev=%s", replacing && newvd_isspare ? 
"spare in" : Index: sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/vdev_raidz.h =================================================================== --- sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/vdev_raidz.h +++ sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/vdev_raidz.h @@ -39,10 +39,18 @@ extern "C" { #endif -#ifdef _KERNEL +typedef struct vdev_raidz { + int vd_logical_width; + int vd_physical_width; + int vd_nparity; + boolean_t vn_expanding; +} vdev_raidz_t; + extern int vdev_raidz_physio(vdev_t *, caddr_t, size_t, uint64_t, uint64_t, boolean_t, boolean_t); -#endif +extern void vdev_raidz_attach_sync(void *, dmu_tx_t *); +extern void vdev_raidz_config_generate(vdev_t *, nvlist_t *); +extern void *vdev_raidz_get_tsd(spa_t *, nvlist_t *); #ifdef __cplusplus } #endif Index: sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev.c =================================================================== --- sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev.c +++ sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev.c @@ -49,6 +49,7 @@ #include #include #include +#include #include #include @@ -584,7 +585,7 @@ { vdev_ops_t *ops; char *type; - uint64_t guid = 0, islog, nparity; + uint64_t guid = 0, islog; vdev_t *vd; vdev_indirect_config_t *vic; @@ -637,47 +638,21 @@ if (ops == &vdev_hole_ops && spa_version(spa) < SPA_VERSION_HOLES) return (SET_ERROR(ENOTSUP)); - /* - * Set the nparity property for RAID-Z vdevs. - */ - nparity = -1ULL; + void *tsd = NULL; + int nparity = 0; if (ops == &vdev_raidz_ops) { - if (nvlist_lookup_uint64(nv, ZPOOL_CONFIG_NPARITY, - &nparity) == 0) { - if (nparity == 0 || nparity > VDEV_RAIDZ_MAXPARITY) - return (SET_ERROR(EINVAL)); - /* - * Previous versions could only support 1 or 2 parity - * device. - */ - if (nparity > 1 && - spa_version(spa) < SPA_VERSION_RAIDZ2) - return (SET_ERROR(ENOTSUP)); - if (nparity > 2 && - spa_version(spa) < SPA_VERSION_RAIDZ3) - return (SET_ERROR(ENOTSUP)); - } else { - /* - * We require the parity to be specified for SPAs that - * support multiple parity levels. - */ - if (spa_version(spa) >= SPA_VERSION_RAIDZ2) - return (SET_ERROR(EINVAL)); - /* - * Otherwise, we default to 1 parity device for RAID-Z. - */ - nparity = 1; - } - } else { - nparity = 0; + vdev_raidz_t *rz = tsd = vdev_raidz_get_tsd(spa, nv); + if (rz == NULL) + return (SET_ERROR(EINVAL)); + nparity = rz->vd_nparity; } - ASSERT(nparity != -1ULL); vd = vdev_alloc_common(spa, id, guid, ops); vic = &vd->vdev_indirect_config; vd->vdev_islog = islog; vd->vdev_nparity = nparity; + vd->vdev_tsd = tsd; if (nvlist_lookup_string(nv, ZPOOL_CONFIG_PATH, &vd->vdev_path) == 0) vd->vdev_path = spa_strdup(vd->vdev_path); @@ -849,6 +824,11 @@ ASSERT(vd->vdev_child == NULL); ASSERT(vd->vdev_guid_sum == vd->vdev_guid); + if (vd->vdev_ops == &vdev_raidz_ops) { + vdev_raidz_t *rz = vd->vdev_tsd; + kmem_free(rz, sizeof(*rz)); + } + /* * Discard allocation state. 
*/ @@ -3155,8 +3135,10 @@ if ((vd = spa_lookup_by_guid(spa, guid, B_TRUE)) == NULL) return (spa_vdev_state_exit(spa, NULL, ENODEV)); +#if 0 if (!vd->vdev_ops->vdev_op_leaf) return (spa_vdev_state_exit(spa, NULL, ENOTSUP)); +#endif wasoffline = (vd->vdev_offline || vd->vdev_tmpoffline); oldstate = vd->vdev_state; Index: sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev_indirect.c =================================================================== --- sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev_indirect.c +++ sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev_indirect.c @@ -1078,6 +1078,13 @@ if (vd->vdev_ops == &vdev_indirect_ops) return; + printf("vdev_indirect_io_start_cb: src=%llx split_offset=%x dst: vd=%u off=%llx size=%x\n", + (long long)zio->io_offset, + (int)split_offset, + (int)vd->vdev_id, + (long long)offset, + (int)size); + zio_nowait(zio_vdev_child_io(zio, NULL, vd, offset, abd_get_offset(zio->io_abd, split_offset), size, zio->io_type, zio->io_priority, Index: sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev_label.c =================================================================== --- sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev_label.c +++ sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev_label.c @@ -141,6 +141,7 @@ #include #include #include +#include #include #include #include @@ -276,31 +277,13 @@ if (vd->vdev_fru != NULL) fnvlist_add_string(nv, ZPOOL_CONFIG_FRU, vd->vdev_fru); - if (vd->vdev_nparity != 0) { - ASSERT(strcmp(vd->vdev_ops->vdev_op_type, - VDEV_TYPE_RAIDZ) == 0); + if (vd->vdev_ops == &vdev_raidz_ops) + vdev_raidz_config_generate(vd, nv); - /* - * Make sure someone hasn't managed to sneak a fancy new vdev - * into a crufty old storage pool. - */ - ASSERT(vd->vdev_nparity == 1 || - (vd->vdev_nparity <= 2 && - spa_version(spa) >= SPA_VERSION_RAIDZ2) || - (vd->vdev_nparity <= 3 && - spa_version(spa) >= SPA_VERSION_RAIDZ3)); - - /* - * Note that we'll add the nparity tag even on storage pools - * that only support a single parity device -- older software - * will just ignore it. - */ - fnvlist_add_uint64(nv, ZPOOL_CONFIG_NPARITY, vd->vdev_nparity); - } - - if (vd->vdev_wholedisk != -1ULL) + if (vd->vdev_wholedisk != -1ULL) { fnvlist_add_uint64(nv, ZPOOL_CONFIG_WHOLE_DISK, vd->vdev_wholedisk); + } if (vd->vdev_not_present && !(flags & VDEV_CONFIG_MISSING)) fnvlist_add_uint64(nv, ZPOOL_CONFIG_NOT_PRESENT, 1); Index: sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev_raidz.c =================================================================== --- sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev_raidz.c +++ sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev_raidz.c @@ -28,12 +28,14 @@ #include #include +#include #include #ifdef illumos #include #endif #include #include +#include #include #include #include @@ -41,6 +43,12 @@ #include #include +#if 0 +#ifdef ZFS_DEBUG +#include /* vdev_xlate testing */ +#endif +#endif + /* * Virtual device vector for RAID-Z. * @@ -113,27 +121,31 @@ uint64_t rc_offset; /* device offset */ uint64_t rc_size; /* I/O size */ abd_t *rc_abd; /* I/O data */ + void *rc_orig_data; /* pre-reconstruction */ void *rc_gdata; /* used to store the "good" version */ int rc_error; /* I/O error for this device */ uint8_t rc_tried; /* Did we attempt this I/O column? */ uint8_t rc_skipped; /* Did we skip this I/O column? */ + uint8_t rc_need_orig_restore; /* need to restore from orig_data? 
*/ } raidz_col_t; +typedef struct raidz_row { + uint64_t rr_cols; /* Regular column count */ + uint64_t rr_missingdata; /* Count of missing data devices */ + uint64_t rr_missingparity; /* Count of missing parity devices */ + uint64_t rr_firstdatacol; /* First data column/parity count */ + abd_t *rr_abd_copy; /* rm_asize-buffer of copied data */ + int rr_code; /* reconstruction code */ + raidz_col_t rr_col[0]; /* Flexible array of I/O columns */ +} raidz_row_t; + typedef struct raidz_map { - uint64_t rm_cols; /* Regular column count */ - uint64_t rm_scols; /* Count including skipped columns */ - uint64_t rm_bigcols; /* Number of oversized columns */ - uint64_t rm_asize; /* Actual total I/O size */ - uint64_t rm_missingdata; /* Count of missing data devices */ - uint64_t rm_missingparity; /* Count of missing parity devices */ - uint64_t rm_firstdatacol; /* First data column/parity count */ - uint64_t rm_nskip; /* Skipped sectors for padding */ - uint64_t rm_skipstart; /* Column index of padding start */ - abd_t *rm_abd_copy; /* rm_asize-buffer of copied data */ uintptr_t rm_reports; /* # of referencing checksum reports */ - uint8_t rm_freed; /* map no longer has referencing ZIO */ - uint8_t rm_ecksuminjected; /* checksum error was injected */ - raidz_col_t rm_col[1]; /* Flexible array of I/O columns */ + boolean_t rm_freed; /* map no longer has referencing ZIO */ + boolean_t rm_ecksuminjected; /* checksum error was injected */ + int rm_nrows; + int rm_nskip; /* Sectors skipped for padding */ + raidz_row_t *rm_row[0]; /* flexible array of rows */ } raidz_map_t; #define VDEV_RAIDZ_P 0 @@ -241,7 +253,7 @@ 0x74, 0xd6, 0xf4, 0xea, 0xa8, 0x50, 0x58, 0xaf, }; -static void vdev_raidz_generate_parity(raidz_map_t *rm); +static void vdev_raidz_generate_parity(raidz_row_t *); /* * Multiply a given number by 2 raised to the given power. 
@@ -263,31 +275,46 @@ } static void -vdev_raidz_map_free(raidz_map_t *rm) +vdev_raidz_row_free(raidz_row_t *rr) { int c; - size_t size; - for (c = 0; c < rm->rm_firstdatacol; c++) { - if (rm->rm_col[c].rc_abd != NULL) - abd_free(rm->rm_col[c].rc_abd); + for (c = 0; c < rr->rr_firstdatacol && c < rr->rr_cols; c++) { + if (rr->rr_col[c].rc_abd != NULL) + abd_free(rr->rr_col[c].rc_abd); - if (rm->rm_col[c].rc_gdata != NULL) - zio_buf_free(rm->rm_col[c].rc_gdata, - rm->rm_col[c].rc_size); + if (rr->rr_col[c].rc_gdata != NULL) { + zio_buf_free(rr->rr_col[c].rc_gdata, + rr->rr_col[c].rc_size); + } + if (rr->rr_col[c].rc_orig_data != NULL) { + zio_buf_free(rr->rr_col[c].rc_orig_data, + rr->rr_col[c].rc_size); + } } - size = 0; - for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) { - if (rm->rm_col[c].rc_abd != NULL) - abd_put(rm->rm_col[c].rc_abd); - size += rm->rm_col[c].rc_size; + for (c = rr->rr_firstdatacol; c < rr->rr_cols; c++) { + if (rr->rr_col[c].rc_abd != NULL) + abd_put(rr->rr_col[c].rc_abd); + if (rr->rr_col[c].rc_orig_data != NULL) { + zio_buf_free(rr->rr_col[c].rc_orig_data, + rr->rr_col[c].rc_size); + } } - if (rm->rm_abd_copy != NULL) - abd_free(rm->rm_abd_copy); + if (rr->rr_abd_copy != NULL) + abd_free(rr->rr_abd_copy); - kmem_free(rm, offsetof(raidz_map_t, rm_col[rm->rm_scols])); + kmem_free(rr, offsetof(raidz_row_t, rr_col[rr->rr_cols])); +} + +static void +vdev_raidz_map_free(raidz_map_t *rm) +{ + for (int i = 0; i < rm->rm_nrows; i++) { + vdev_raidz_row_free(rm->rm_row[i]); + } + kmem_free(rm, offsetof(raidz_map_t, rm_row[rm->rm_nrows])); } static void @@ -296,10 +323,11 @@ raidz_map_t *rm = zio->io_vsd; ASSERT0(rm->rm_freed); - rm->rm_freed = 1; + rm->rm_freed = B_TRUE; - if (rm->rm_reports == 0) + if (rm->rm_reports == 0) { vdev_raidz_map_free(rm); + } } /*ARGSUSED*/ @@ -310,7 +338,7 @@ ASSERT3U(rm->rm_reports, >, 0); - if (--rm->rm_reports == 0 && rm->rm_freed != 0) + if (--rm->rm_reports == 0 && rm->rm_freed) vdev_raidz_map_free(rm); } @@ -324,18 +352,22 @@ const char *good = NULL; char *bad; + zfs_dbgmsg("checksum error on rm=%p", rm); + if (good_data == NULL) { zfs_ereport_finish_checksum(zcr, NULL, NULL, B_FALSE); return; } - if (c < rm->rm_firstdatacol) { + zfs_ereport_finish_checksum(zcr, NULL, NULL, B_FALSE); +#if 0 + if (c < rm->rr_firstdatacol) { /* * The first time through, calculate the parity blocks for * the good data (this relies on the fact that the good * data never changes for a given logical ZIO) */ - if (rm->rm_col[0].rc_gdata == NULL) { + if (rm->rr_col[0].rc_gdata == NULL) { abd_t *bad_parity[VDEV_RAIDZ_MAXPARITY]; char *buf; int offset; @@ -345,22 +377,22 @@ * good_data, first saving the parity bufs and * replacing them with buffers to hold the result. 
*/ - for (x = 0; x < rm->rm_firstdatacol; x++) { - bad_parity[x] = rm->rm_col[x].rc_abd; - rm->rm_col[x].rc_gdata = - zio_buf_alloc(rm->rm_col[x].rc_size); - rm->rm_col[x].rc_abd = - abd_get_from_buf(rm->rm_col[x].rc_gdata, - rm->rm_col[x].rc_size); + for (x = 0; x < rm->rr_firstdatacol; x++) { + bad_parity[x] = rm->rr_col[x].rc_abd; + rm->rr_col[x].rc_gdata = + zio_buf_alloc(rm->rr_col[x].rc_size); + rm->rr_col[x].rc_abd = + abd_get_from_buf(rm->rr_col[x].rc_gdata, + rm->rr_col[x].rc_size); } /* fill in the data columns from good_data */ buf = (char *)good_data; - for (; x < rm->rm_cols; x++) { - abd_put(rm->rm_col[x].rc_abd); - rm->rm_col[x].rc_abd = abd_get_from_buf(buf, - rm->rm_col[x].rc_size); - buf += rm->rm_col[x].rc_size; + for (; x < rm->rr_cols; x++) { + abd_put(rm->rr_col[x].rc_abd); + rm->rr_col[x].rc_abd = abd_get_from_buf(buf, + rm->rr_col[x].rc_size); + buf += rm->rr_col[x].rc_size; } /* @@ -369,34 +401,35 @@ vdev_raidz_generate_parity(rm); /* restore everything back to its original state */ - for (x = 0; x < rm->rm_firstdatacol; x++) { - abd_put(rm->rm_col[x].rc_abd); - rm->rm_col[x].rc_abd = bad_parity[x]; + for (x = 0; x < rm->rr_firstdatacol; x++) { + abd_put(rm->rr_col[x].rc_abd); + rm->rr_col[x].rc_abd = bad_parity[x]; } offset = 0; - for (x = rm->rm_firstdatacol; x < rm->rm_cols; x++) { - abd_put(rm->rm_col[x].rc_abd); - rm->rm_col[x].rc_abd = abd_get_offset( - rm->rm_abd_copy, offset); - offset += rm->rm_col[x].rc_size; + for (x = rm->rr_firstdatacol; x < rm->rr_cols; x++) { + abd_put(rm->rr_col[x].rc_abd); + rm->rr_col[x].rc_abd = abd_get_offset( + rm->rr_abd_copy, offset); + offset += rm->rr_col[x].rc_size; } } - ASSERT3P(rm->rm_col[c].rc_gdata, !=, NULL); - good = rm->rm_col[c].rc_gdata; + ASSERT3P(rm->rr_col[c].rc_gdata, !=, NULL); + good = rm->rr_col[c].rc_gdata; } else { /* adjust good_data to point at the start of our column */ good = good_data; - for (x = rm->rm_firstdatacol; x < c; x++) - good += rm->rm_col[x].rc_size; + for (x = rm->rr_firstdatacol; x < c; x++) + good += rm->rr_col[x].rc_size; } - bad = abd_borrow_buf_copy(rm->rm_col[c].rc_abd, rm->rm_col[c].rc_size); + bad = abd_borrow_buf_copy(rm->rr_col[c].rc_abd, rm->rr_col[c].rc_size); /* we drop the ereport if it ends up that the data was good */ zfs_ereport_finish_checksum(zcr, good, bad, B_TRUE); - abd_return_buf(rm->rm_col[c].rc_abd, bad, rm->rm_col[c].rc_size); + abd_return_buf(rm->rr_col[c].rc_abd, bad, rm->rr_col[c].rc_size); +#endif } /* @@ -409,10 +442,7 @@ vdev_raidz_cksum_report(zio_t *zio, zio_cksum_report_t *zcr, void *arg) { size_t c = (size_t)(uintptr_t)arg; - size_t offset; - raidz_map_t *rm = zio->io_vsd; - size_t size; /* set up the report and bump the refcount */ zcr->zcr_cbdata = rm; @@ -423,7 +453,7 @@ rm->rm_reports++; ASSERT3U(rm->rm_reports, >, 0); - if (rm->rm_abd_copy != NULL) + if (rm->rm_row[0]->rr_abd_copy != NULL) return; /* @@ -435,24 +465,33 @@ * to copy them. 
*/ - size = 0; - for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) - size += rm->rm_col[c].rc_size; + for (int i = 0; i < rm->rm_nrows; i++) { + raidz_row_t *rr = rm->rm_row[i]; + size_t offset; + size_t size = 0; + for (c = rr->rr_firstdatacol; c < rr->rr_cols; c++) + size += rr->rr_col[c].rc_size; + + rr->rr_abd_copy = + abd_alloc_sametype(rr->rr_col[rr->rr_firstdatacol].rc_abd, + size); - rm->rm_abd_copy = - abd_alloc_sametype(rm->rm_col[rm->rm_firstdatacol].rc_abd, size); + for (offset = 0, c = rr->rr_firstdatacol; c < rr->rr_cols; c++) { + raidz_col_t *col = &rr->rr_col[c]; + + if (col->rc_size == 0) + continue; - for (offset = 0, c = rm->rm_firstdatacol; c < rm->rm_cols; c++) { - raidz_col_t *col = &rm->rm_col[c]; - abd_t *tmp = abd_get_offset(rm->rm_abd_copy, offset); + abd_t *tmp = abd_get_offset(rr->rr_abd_copy, offset); - abd_copy(tmp, col->rc_abd, col->rc_size); - abd_put(col->rc_abd); - col->rc_abd = tmp; + abd_copy(tmp, col->rc_abd, col->rc_size); + abd_put(col->rc_abd); + col->rc_abd = tmp; - offset += col->rc_size; + offset += col->rc_size; + } + ASSERT3U(offset, ==, size); } - ASSERT3U(offset, ==, size); } static const zio_vsd_ops_t vdev_raidz_vsd_ops = { @@ -468,7 +507,7 @@ vdev_raidz_map_alloc(abd_t *abd, uint64_t size, uint64_t offset, boolean_t dofree, uint64_t unit_shift, uint64_t dcols, uint64_t nparity) { - raidz_map_t *rm; + raidz_row_t *rr; /* The starting RAIDZ (parent) vdev sector of the block. */ uint64_t b = offset >> unit_shift; /* The zio's size in units of the vdev's minimum sector size. */ @@ -477,9 +516,13 @@ uint64_t f = b % dcols; /* The starting byte offset on each child vdev. */ uint64_t o = (b / dcols) << unit_shift; - uint64_t q, r, c, bc, col, acols, scols, coff, devidx, asize, tot; + uint64_t q, r, c, bc, col, acols, coff, devidx, asize, tot; uint64_t off = 0; + raidz_map_t *rm = + kmem_zalloc(offsetof(raidz_map_t, rm_row[1]), KM_SLEEP); + rm->rm_nrows = 1; + /* * "Quotient": The number of data sectors for this stripe on all but * the "big column" child vdevs that also contain "remainder" data. @@ -502,77 +545,63 @@ tot = s + nparity * (q + (r == 0 ? 0 : 1)); /* acols: The columns that will be accessed. */ - /* scols: The columns that will be accessed or skipped. */ if (q == 0) { /* Our I/O request doesn't span all child vdevs. 
*/ acols = bc; - scols = MIN(dcols, roundup(bc, nparity + 1)); } else { acols = dcols; - scols = dcols; } - ASSERT3U(acols, <=, scols); - - rm = kmem_alloc(offsetof(raidz_map_t, rm_col[scols]), KM_SLEEP); + rr = kmem_alloc(offsetof(raidz_row_t, rr_col[acols]), KM_SLEEP); + rm->rm_row[0] = rr; - rm->rm_cols = acols; - rm->rm_scols = scols; - rm->rm_bigcols = bc; - rm->rm_skipstart = bc; - rm->rm_missingdata = 0; - rm->rm_missingparity = 0; - rm->rm_firstdatacol = nparity; - rm->rm_abd_copy = NULL; - rm->rm_reports = 0; - rm->rm_freed = 0; - rm->rm_ecksuminjected = 0; + rr->rr_cols = acols; + rr->rr_missingdata = 0; + rr->rr_missingparity = 0; + rr->rr_firstdatacol = nparity; + rr->rr_abd_copy = NULL; asize = 0; - for (c = 0; c < scols; c++) { + for (c = 0; c < acols; c++) { col = f + c; coff = o; if (col >= dcols) { col -= dcols; coff += 1ULL << unit_shift; } - rm->rm_col[c].rc_devidx = col; - rm->rm_col[c].rc_offset = coff; - rm->rm_col[c].rc_abd = NULL; - rm->rm_col[c].rc_gdata = NULL; - rm->rm_col[c].rc_error = 0; - rm->rm_col[c].rc_tried = 0; - rm->rm_col[c].rc_skipped = 0; - - if (c >= acols) - rm->rm_col[c].rc_size = 0; - else if (c < bc) - rm->rm_col[c].rc_size = (q + 1) << unit_shift; + rr->rr_col[c].rc_devidx = col; + rr->rr_col[c].rc_offset = coff; + rr->rr_col[c].rc_abd = NULL; + rr->rr_col[c].rc_gdata = NULL; + rr->rr_col[c].rc_orig_data = NULL; + rr->rr_col[c].rc_error = 0; + rr->rr_col[c].rc_tried = 0; + rr->rr_col[c].rc_skipped = 0; + rr->rr_col[c].rc_need_orig_restore = B_FALSE; + + if (c < bc) + rr->rr_col[c].rc_size = (q + 1) << unit_shift; else - rm->rm_col[c].rc_size = q << unit_shift; + rr->rr_col[c].rc_size = q << unit_shift; - asize += rm->rm_col[c].rc_size; + asize += rr->rr_col[c].rc_size; } ASSERT3U(asize, ==, tot << unit_shift); - rm->rm_asize = roundup(asize, (nparity + 1) << unit_shift); rm->rm_nskip = roundup(tot, nparity + 1) - tot; - ASSERT3U(rm->rm_asize - asize, ==, rm->rm_nskip << unit_shift); - ASSERT3U(rm->rm_nskip, <=, nparity); if (!dofree) { - for (c = 0; c < rm->rm_firstdatacol; c++) { - rm->rm_col[c].rc_abd = - abd_alloc_linear(rm->rm_col[c].rc_size, B_TRUE); - } + for (c = 0; c < rr->rr_firstdatacol; c++) + rr->rr_col[c].rc_abd = + abd_alloc_linear(rr->rr_col[c].rc_size, B_TRUE); - rm->rm_col[c].rc_abd = abd_get_offset(abd, 0); - off = rm->rm_col[c].rc_size; + rr->rr_col[c].rc_abd = abd_get_offset(abd, 0); + off = rr->rr_col[c].rc_size; for (c = c + 1; c < acols; c++) { - rm->rm_col[c].rc_abd = abd_get_offset(abd, off); - off += rm->rm_col[c].rc_size; + rr->rr_col[c].rc_abd = abd_get_offset(abd, off); + off += rr->rr_col[c].rc_size; } } @@ -596,20 +625,182 @@ * skip the first column since at least one data and one parity * column must appear in each row. */ - ASSERT(rm->rm_cols >= 2); - ASSERT(rm->rm_col[0].rc_size == rm->rm_col[1].rc_size); + ASSERT(rr->rr_cols >= 2); + ASSERT(rr->rr_col[0].rc_size == rr->rr_col[1].rc_size); + + if (rr->rr_firstdatacol == 1 && (offset & (1ULL << 20))) { + devidx = rr->rr_col[0].rc_devidx; + o = rr->rr_col[0].rc_offset; + rr->rr_col[0].rc_devidx = rr->rr_col[1].rc_devidx; + rr->rr_col[0].rc_offset = rr->rr_col[1].rc_offset; + rr->rr_col[1].rc_devidx = devidx; + rr->rr_col[1].rc_offset = o; + } + + return (rm); +} + +static raidz_map_t * +vdev_raidz_map_alloc_expanded(abd_t *abd, uint64_t size, uint64_t offset, boolean_t dofree, + uint64_t ashift, uint64_t physical_cols, uint64_t logical_cols, + uint64_t nparity) +{ + /* The starting RAIDZ (parent) vdev sector of the block. 
*/ + uint64_t b = offset >> ashift; + /* The zio's size in units of the vdev's minimum sector size. */ + uint64_t s = size >> ashift; + uint64_t cur_col = b % physical_cols; + /* The starting byte offset on each child vdev. */ + uint64_t child_offset = (b / physical_cols) << ashift; + uint64_t q, r, bc, devidx, asize, tot; + + /* + * "Quotient": The number of data sectors for this stripe on all but + * the "big column" child vdevs that also contain "remainder" data. + * AKA "full rows" + */ + q = s / (logical_cols - nparity); + + /* + * "Remainder": The number of partial stripe data sectors in this I/O. + * This will add a sector to some, but not all, child vdevs. + */ + r = s - q * (logical_cols - nparity); + + /* The number of "big columns" - those which contain remainder data. */ + bc = (r == 0 ? 0 : r + nparity); + + /* + * The total number of data and parity sectors associated with + * this I/O. + */ + tot = s + nparity * (q + (r == 0 ? 0 : 1)); + + /* How many rows contain data (not skip) */ + uint64_t rows = howmany(tot, logical_cols); + int cols = MIN(tot, logical_cols); + + raidz_map_t *rm = + kmem_zalloc(offsetof(raidz_map_t, rm_row[rows]), + KM_SLEEP); + rm->rm_nrows = rows; + rm->rm_nskip = roundup(tot, nparity + 1) - tot; + asize = 0; - if (rm->rm_firstdatacol == 1 && (offset & (1ULL << 20))) { - devidx = rm->rm_col[0].rc_devidx; - o = rm->rm_col[0].rc_offset; - rm->rm_col[0].rc_devidx = rm->rm_col[1].rc_devidx; - rm->rm_col[0].rc_offset = rm->rm_col[1].rc_offset; - rm->rm_col[1].rc_devidx = devidx; - rm->rm_col[1].rc_offset = o; + zfs_dbgmsg("rm=%p s=%d q=%d r=%d bc=%d nrows=%d cols=%d", + rm, (int)s, (int)q, (int)r, (int)bc, (int)rows, (int)cols); + + for (uint64_t row = 0; row < rows; row++) { + raidz_row_t *rr = kmem_alloc(offsetof(raidz_row_t, + rr_col[cols]), KM_SLEEP); + rm->rm_row[row] = rr; + + /* + * We set cols to the entire width of the block, even + * if this row is shorter. This is needed because parity + * generation (for Q and R) needs to know the entire width, + * because it treats the short row as though it was + * full-width (and the "phantom" sectors were zero-filled). + * + * Another approach to this would be to set cols shorter + * (to just the number of columns that we might do i/o to) + * and have another mechanism to tell the parity generation + * about the "entire width". Reconstruction (at least + * vdev_raidz_reconstruct_general()) would also need to + * know about the "entire width". + */ + rr->rr_cols = cols; + rr->rr_missingdata = 0; + rr->rr_missingparity = 0; + rr->rr_firstdatacol = nparity; + rr->rr_abd_copy = NULL; + + for (int c = 0; c < rr->rr_cols; c++, cur_col++) { + if (cur_col >= physical_cols) { + cur_col -= physical_cols; + child_offset += 1ULL << ashift; + } + rr->rr_col[c].rc_devidx = cur_col; + rr->rr_col[c].rc_offset = child_offset; + rr->rr_col[c].rc_gdata = NULL; + rr->rr_col[c].rc_orig_data = NULL; + rr->rr_col[c].rc_error = 0; + rr->rr_col[c].rc_tried = 0; + rr->rr_col[c].rc_skipped = 0; + rr->rr_col[c].rc_abd = NULL; + rr->rr_col[c].rc_need_orig_restore = B_FALSE; + + uint64_t dc = c - rr->rr_firstdatacol; + if (c < rr->rr_firstdatacol) { + rr->rr_col[c].rc_size = 1ULL << ashift; + if (!dofree) { + rr->rr_col[c].rc_abd = + abd_alloc_linear(rr->rr_col[c].rc_size, + B_TRUE); + } + } else if (row == rows - 1 && bc != 0 && c >= bc) { + /* + * Past the end, this for parity generation. 
+ */ + rr->rr_col[c].rc_size = 0; + rr->rr_col[c].rc_abd = NULL; + } else { + /* XXX ASCII art diagram here */ + /* "data column" (col excluding parity) */ + uint64_t off; + + if (c < bc || r == 0) { + off = dc * rows + row; + } else { + off = r * rows + + (dc - r) * (rows - 1) + row; + } + zfs_dbgmsg("rm=%p row=%d c=%d dc=%d off=%u devidx=%u", + rm, (int)row, (int)c, (int)dc, (int)off, (int)cur_col); + rr->rr_col[c].rc_size = 1ULL << ashift; + if (!dofree) { + rr->rr_col[c].rc_abd = + abd_get_offset(abd, off << ashift); + } + } + + asize += rr->rr_col[c].rc_size; + } + + /* + * If all data stored spans all columns, there's a danger that parity + * will always be on the same device and, since parity isn't read + * during normal operation, that that device's I/O bandwidth won't be + * used effectively. We therefore switch the parity every 1MB. + * + * ... at least that was, ostensibly, the theory. As a practical + * matter unless we juggle the parity between all devices evenly, we + * won't see any benefit. Further, occasional writes that aren't a + * multiple of the LCM of the number of children and the minimum + * stripe width are sufficient to avoid pessimal behavior. + * Unfortunately, this decision created an implicit on-disk format + * requirement that we need to support for all eternity, but only + * for single-parity RAID-Z. + * + * If we intend to skip a sector in the zeroth column for padding + * we must make sure to note this swap. We will never intend to + * skip the first column since at least one data and one parity + * column must appear in each row. + */ + if (rr->rr_firstdatacol == 1 && rr->rr_cols > 1 && + (offset & (1ULL << 20))) { + ASSERT(rr->rr_cols >= 2); + ASSERT(rr->rr_col[0].rc_size == rr->rr_col[1].rc_size); + devidx = rr->rr_col[0].rc_devidx; + uint64_t o = rr->rr_col[0].rc_offset; + rr->rr_col[0].rc_devidx = rr->rr_col[1].rc_devidx; + rr->rr_col[0].rc_offset = rr->rr_col[1].rc_offset; + rr->rr_col[1].rc_devidx = devidx; + rr->rr_col[1].rc_offset = o; + } - if (rm->rm_skipstart == 0) - rm->rm_skipstart = 1; } + ASSERT3U(asize, ==, tot << ashift); return (rm); } @@ -676,55 +867,48 @@ } static void -vdev_raidz_generate_parity_p(raidz_map_t *rm) +vdev_raidz_generate_parity_p(raidz_row_t *rr) { - uint64_t *p; - int c; - abd_t *src; + uint64_t *p = abd_to_buf(rr->rr_col[VDEV_RAIDZ_P].rc_abd); - for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) { - src = rm->rm_col[c].rc_abd; - p = abd_to_buf(rm->rm_col[VDEV_RAIDZ_P].rc_abd); + for (int c = rr->rr_firstdatacol; c < rr->rr_cols; c++) { + abd_t *src = rr->rr_col[c].rc_abd; - if (c == rm->rm_firstdatacol) { - abd_copy_to_buf(p, src, rm->rm_col[c].rc_size); + if (c == rr->rr_firstdatacol) { + abd_copy_to_buf(p, src, rr->rr_col[c].rc_size); } else { struct pqr_struct pqr = { p, NULL, NULL }; - (void) abd_iterate_func(src, 0, rm->rm_col[c].rc_size, + (void) abd_iterate_func(src, 0, rr->rr_col[c].rc_size, vdev_raidz_p_func, &pqr); } } } static void -vdev_raidz_generate_parity_pq(raidz_map_t *rm) +vdev_raidz_generate_parity_pq(raidz_row_t *rr) { - uint64_t *p, *q, pcnt, ccnt, mask, i; - int c; - abd_t *src; + uint64_t *p = abd_to_buf(rr->rr_col[VDEV_RAIDZ_P].rc_abd); + uint64_t *q = abd_to_buf(rr->rr_col[VDEV_RAIDZ_Q].rc_abd); + uint64_t pcnt = rr->rr_col[VDEV_RAIDZ_P].rc_size / sizeof (p[0]); + ASSERT(rr->rr_col[VDEV_RAIDZ_P].rc_size == + rr->rr_col[VDEV_RAIDZ_Q].rc_size); - pcnt = rm->rm_col[VDEV_RAIDZ_P].rc_size / sizeof (p[0]); - ASSERT(rm->rm_col[VDEV_RAIDZ_P].rc_size == - rm->rm_col[VDEV_RAIDZ_Q].rc_size); + for (int c 
= rr->rr_firstdatacol; c < rr->rr_cols; c++) { + abd_t *src = rr->rr_col[c].rc_abd; - for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) { - src = rm->rm_col[c].rc_abd; - p = abd_to_buf(rm->rm_col[VDEV_RAIDZ_P].rc_abd); - q = abd_to_buf(rm->rm_col[VDEV_RAIDZ_Q].rc_abd); + uint64_t ccnt = rr->rr_col[c].rc_size / sizeof (p[0]); - ccnt = rm->rm_col[c].rc_size / sizeof (p[0]); - - if (c == rm->rm_firstdatacol) { - abd_copy_to_buf(p, src, rm->rm_col[c].rc_size); - (void) memcpy(q, p, rm->rm_col[c].rc_size); + if (c == rr->rr_firstdatacol) { + abd_copy_to_buf(p, src, rr->rr_col[c].rc_size); + (void) memcpy(q, p, rr->rr_col[c].rc_size); } else { struct pqr_struct pqr = { p, q, NULL }; - (void) abd_iterate_func(src, 0, rm->rm_col[c].rc_size, + (void) abd_iterate_func(src, 0, rr->rr_col[c].rc_size, vdev_raidz_pq_func, &pqr); } - if (c == rm->rm_firstdatacol) { - for (i = ccnt; i < pcnt; i++) { + if (c == rr->rr_firstdatacol) { + for (uint64_t i = ccnt; i < pcnt; i++) { p[i] = 0; q[i] = 0; } @@ -733,7 +917,8 @@ * Treat short columns as though they are full of 0s. * Note that there's therefore nothing needed for P. */ - for (i = ccnt; i < pcnt; i++) { + uint64_t mask; + for (uint64_t i = ccnt; i < pcnt; i++) { VDEV_RAIDZ_64MUL_2(q[i], mask); } } @@ -741,38 +926,35 @@ } static void -vdev_raidz_generate_parity_pqr(raidz_map_t *rm) +vdev_raidz_generate_parity_pqr(raidz_row_t *rr) { - uint64_t *p, *q, *r, pcnt, ccnt, mask, i; - int c; - abd_t *src; - - pcnt = rm->rm_col[VDEV_RAIDZ_P].rc_size / sizeof (p[0]); - ASSERT(rm->rm_col[VDEV_RAIDZ_P].rc_size == - rm->rm_col[VDEV_RAIDZ_Q].rc_size); - ASSERT(rm->rm_col[VDEV_RAIDZ_P].rc_size == - rm->rm_col[VDEV_RAIDZ_R].rc_size); - - for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) { - src = rm->rm_col[c].rc_abd; - p = abd_to_buf(rm->rm_col[VDEV_RAIDZ_P].rc_abd); - q = abd_to_buf(rm->rm_col[VDEV_RAIDZ_Q].rc_abd); - r = abd_to_buf(rm->rm_col[VDEV_RAIDZ_R].rc_abd); - - ccnt = rm->rm_col[c].rc_size / sizeof (p[0]); - - if (c == rm->rm_firstdatacol) { - abd_copy_to_buf(p, src, rm->rm_col[c].rc_size); - (void) memcpy(q, p, rm->rm_col[c].rc_size); - (void) memcpy(r, p, rm->rm_col[c].rc_size); + uint64_t *p = abd_to_buf(rr->rr_col[VDEV_RAIDZ_P].rc_abd); + uint64_t *q = abd_to_buf(rr->rr_col[VDEV_RAIDZ_Q].rc_abd); + uint64_t *r = abd_to_buf(rr->rr_col[VDEV_RAIDZ_R].rc_abd); + uint64_t pcnt = rr->rr_col[VDEV_RAIDZ_P].rc_size / sizeof (p[0]); + ASSERT(rr->rr_col[VDEV_RAIDZ_P].rc_size == + rr->rr_col[VDEV_RAIDZ_Q].rc_size); + ASSERT(rr->rr_col[VDEV_RAIDZ_P].rc_size == + rr->rr_col[VDEV_RAIDZ_R].rc_size); + + for (int c = rr->rr_firstdatacol; c < rr->rr_cols; c++) { + abd_t *src = rr->rr_col[c].rc_abd; + + uint64_t ccnt = rr->rr_col[c].rc_size / sizeof (p[0]); + + if (c == rr->rr_firstdatacol) { + abd_copy_to_buf(p, src, rr->rr_col[c].rc_size); + (void) memcpy(q, p, rr->rr_col[c].rc_size); + (void) memcpy(r, p, rr->rr_col[c].rc_size); } else { struct pqr_struct pqr = { p, q, r }; - (void) abd_iterate_func(src, 0, rm->rm_col[c].rc_size, + (void) abd_iterate_func(src, 0, rr->rr_col[c].rc_size, vdev_raidz_pqr_func, &pqr); } - if (c == rm->rm_firstdatacol) { - for (i = ccnt; i < pcnt; i++) { + if (c == rr->rr_firstdatacol) { + for (uint64_t i = ccnt; i < pcnt; i++) { + /* XXX does this really happen? firstdatacol should be the same size as the parity cols */ p[i] = 0; q[i] = 0; r[i] = 0; @@ -782,7 +964,8 @@ * Treat short columns as though they are full of 0s. * Note that there's therefore nothing needed for P. 
*/ - for (i = ccnt; i < pcnt; i++) { + uint64_t mask; + for (uint64_t i = ccnt; i < pcnt; i++) { VDEV_RAIDZ_64MUL_2(q[i], mask); VDEV_RAIDZ_64MUL_4(r[i], mask); } @@ -795,17 +978,27 @@ * parity columns available. */ static void -vdev_raidz_generate_parity(raidz_map_t *rm) +vdev_raidz_generate_parity(raidz_row_t *rr) { - switch (rm->rm_firstdatacol) { + if (rr->rr_cols == 0) { + /* + * We are handling this block one row at a time (because + * this block has a different logical vs physical width, + * due to RAIDZ expansion), and this is a pad-only row, + * which has no parity. + */ + return; + } + + switch (rr->rr_firstdatacol) { case 1: - vdev_raidz_generate_parity_p(rm); + vdev_raidz_generate_parity_p(rr); break; case 2: - vdev_raidz_generate_parity_pq(rm); + vdev_raidz_generate_parity_pq(rr); break; case 3: - vdev_raidz_generate_parity_pqr(rm); + vdev_raidz_generate_parity_pqr(rr); break; default: cmn_err(CE_PANIC, "invalid RAID-Z configuration"); @@ -929,30 +1122,31 @@ } static int -vdev_raidz_reconstruct_p(raidz_map_t *rm, int *tgts, int ntgts) +vdev_raidz_reconstruct_p(raidz_row_t *rr, int *tgts, int ntgts) { int x = tgts[0]; - int c; abd_t *dst, *src; - ASSERT(ntgts == 1); - ASSERT(x >= rm->rm_firstdatacol); - ASSERT(x < rm->rm_cols); + zfs_dbgmsg("reconstruct_p(rm=%p x=%u)", + rr, x); + + ASSERT3U(ntgts, ==, 1); + ASSERT3U(x, >=, rr->rr_firstdatacol); + ASSERT3U(x, <, rr->rr_cols); - ASSERT(rm->rm_col[x].rc_size <= rm->rm_col[VDEV_RAIDZ_P].rc_size); - ASSERT(rm->rm_col[x].rc_size > 0); + ASSERT3U(rr->rr_col[x].rc_size, <=, rr->rr_col[VDEV_RAIDZ_P].rc_size); - src = rm->rm_col[VDEV_RAIDZ_P].rc_abd; - dst = rm->rm_col[x].rc_abd; + src = rr->rr_col[VDEV_RAIDZ_P].rc_abd; + dst = rr->rr_col[x].rc_abd; - abd_copy(dst, src, rm->rm_col[x].rc_size); + abd_copy(dst, src, rr->rr_col[x].rc_size); - for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) { - uint64_t size = MIN(rm->rm_col[x].rc_size, - rm->rm_col[c].rc_size); + for (int c = rr->rr_firstdatacol; c < rr->rr_cols; c++) { + uint64_t size = MIN(rr->rr_col[x].rc_size, + rr->rr_col[c].rc_size); - src = rm->rm_col[c].rc_abd; - dst = rm->rm_col[x].rc_abd; + src = rr->rr_col[c].rc_abd; + dst = rr->rr_col[x].rc_abd; /* XXX not needed, done above */ if (c == x) continue; @@ -965,51 +1159,54 @@ } static int -vdev_raidz_reconstruct_q(raidz_map_t *rm, int *tgts, int ntgts) +vdev_raidz_reconstruct_q(raidz_row_t *rr, int *tgts, int ntgts) { int x = tgts[0]; int c, exp; abd_t *dst, *src; + zfs_dbgmsg("reconstruct_q(rm=%p x=%u)", + rr, x); + ASSERT(ntgts == 1); - ASSERT(rm->rm_col[x].rc_size <= rm->rm_col[VDEV_RAIDZ_Q].rc_size); + ASSERT(rr->rr_col[x].rc_size <= rr->rr_col[VDEV_RAIDZ_Q].rc_size); - for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) { - uint64_t size = (c == x) ? 0 : MIN(rm->rm_col[x].rc_size, - rm->rm_col[c].rc_size); + for (c = rr->rr_firstdatacol; c < rr->rr_cols; c++) { + uint64_t size = (c == x) ? 
0 : MIN(rr->rr_col[x].rc_size, + rr->rr_col[c].rc_size); - src = rm->rm_col[c].rc_abd; - dst = rm->rm_col[x].rc_abd; + src = rr->rr_col[c].rc_abd; + dst = rr->rr_col[x].rc_abd; - if (c == rm->rm_firstdatacol) { + if (c == rr->rr_firstdatacol) { abd_copy(dst, src, size); - if (rm->rm_col[x].rc_size > size) + if (rr->rr_col[x].rc_size > size) abd_zero_off(dst, size, - rm->rm_col[x].rc_size - size); + rr->rr_col[x].rc_size - size); } else { - ASSERT3U(size, <=, rm->rm_col[x].rc_size); + ASSERT3U(size, <=, rr->rr_col[x].rc_size); (void) abd_iterate_func2(dst, src, 0, 0, size, vdev_raidz_reconst_q_pre_func, NULL); (void) abd_iterate_func(dst, - size, rm->rm_col[x].rc_size - size, + size, rr->rr_col[x].rc_size - size, vdev_raidz_reconst_q_pre_tail_func, NULL); } } - src = rm->rm_col[VDEV_RAIDZ_Q].rc_abd; - dst = rm->rm_col[x].rc_abd; - exp = 255 - (rm->rm_cols - 1 - x); + src = rr->rr_col[VDEV_RAIDZ_Q].rc_abd; + dst = rr->rr_col[x].rc_abd; + exp = 255 - (rr->rr_cols - 1 - x); struct reconst_q_struct rq = { abd_to_buf(src), exp }; - (void) abd_iterate_func(dst, 0, rm->rm_col[x].rc_size, + (void) abd_iterate_func(dst, 0, rr->rr_col[x].rc_size, vdev_raidz_reconst_q_post_func, &rq); return (1 << VDEV_RAIDZ_Q); } static int -vdev_raidz_reconstruct_pq(raidz_map_t *rm, int *tgts, int ntgts) +vdev_raidz_reconstruct_pq(raidz_row_t *rr, int *tgts, int ntgts) { uint8_t *p, *q, *pxy, *qxy, tmp, a, b, aexp, bexp; abd_t *pdata, *qdata; @@ -1018,12 +1215,15 @@ int y = tgts[1]; abd_t *xd, *yd; + zfs_dbgmsg("reconstruct_pq(rm=%p x=%u y=%u)", + rr, x, y); + ASSERT(ntgts == 2); ASSERT(x < y); - ASSERT(x >= rm->rm_firstdatacol); - ASSERT(y < rm->rm_cols); + ASSERT(x >= rr->rr_firstdatacol); + ASSERT(y < rr->rr_cols); - ASSERT(rm->rm_col[x].rc_size >= rm->rm_col[y].rc_size); + ASSERT(rr->rr_col[x].rc_size >= rr->rr_col[y].rc_size); /* * Move the parity data aside -- we're going to compute parity as @@ -1032,29 +1232,29 @@ * parity so we make those columns appear to be full of zeros by * setting their lengths to zero. 
*/ - pdata = rm->rm_col[VDEV_RAIDZ_P].rc_abd; - qdata = rm->rm_col[VDEV_RAIDZ_Q].rc_abd; - xsize = rm->rm_col[x].rc_size; - ysize = rm->rm_col[y].rc_size; + pdata = rr->rr_col[VDEV_RAIDZ_P].rc_abd; + qdata = rr->rr_col[VDEV_RAIDZ_Q].rc_abd; + xsize = rr->rr_col[x].rc_size; + ysize = rr->rr_col[y].rc_size; - rm->rm_col[VDEV_RAIDZ_P].rc_abd = - abd_alloc_linear(rm->rm_col[VDEV_RAIDZ_P].rc_size, B_TRUE); - rm->rm_col[VDEV_RAIDZ_Q].rc_abd = - abd_alloc_linear(rm->rm_col[VDEV_RAIDZ_Q].rc_size, B_TRUE); - rm->rm_col[x].rc_size = 0; - rm->rm_col[y].rc_size = 0; + rr->rr_col[VDEV_RAIDZ_P].rc_abd = + abd_alloc_linear(rr->rr_col[VDEV_RAIDZ_P].rc_size, B_TRUE); + rr->rr_col[VDEV_RAIDZ_Q].rc_abd = + abd_alloc_linear(rr->rr_col[VDEV_RAIDZ_Q].rc_size, B_TRUE); + rr->rr_col[x].rc_size = 0; + rr->rr_col[y].rc_size = 0; - vdev_raidz_generate_parity_pq(rm); + vdev_raidz_generate_parity_pq(rr); - rm->rm_col[x].rc_size = xsize; - rm->rm_col[y].rc_size = ysize; + rr->rr_col[x].rc_size = xsize; + rr->rr_col[y].rc_size = ysize; p = abd_to_buf(pdata); q = abd_to_buf(qdata); - pxy = abd_to_buf(rm->rm_col[VDEV_RAIDZ_P].rc_abd); - qxy = abd_to_buf(rm->rm_col[VDEV_RAIDZ_Q].rc_abd); - xd = rm->rm_col[x].rc_abd; - yd = rm->rm_col[y].rc_abd; + pxy = abd_to_buf(rr->rr_col[VDEV_RAIDZ_P].rc_abd); + qxy = abd_to_buf(rr->rr_col[VDEV_RAIDZ_Q].rc_abd); + xd = rr->rr_col[x].rc_abd; + yd = rr->rr_col[y].rc_abd; /* * We now have: @@ -1072,7 +1272,7 @@ */ a = vdev_raidz_pow2[255 + x - y]; - b = vdev_raidz_pow2[255 - (rm->rm_cols - 1 - x)]; + b = vdev_raidz_pow2[255 - (rr->rr_cols - 1 - x)]; tmp = 255 - vdev_raidz_log2[a ^ 1]; aexp = vdev_raidz_log2[vdev_raidz_exp2(a, tmp)]; @@ -1085,14 +1285,14 @@ (void) abd_iterate_func(xd, ysize, xsize - ysize, vdev_raidz_reconst_pq_tail_func, &rpq); - abd_free(rm->rm_col[VDEV_RAIDZ_P].rc_abd); - abd_free(rm->rm_col[VDEV_RAIDZ_Q].rc_abd); + abd_free(rr->rr_col[VDEV_RAIDZ_P].rc_abd); + abd_free(rr->rr_col[VDEV_RAIDZ_Q].rc_abd); /* * Restore the saved parity data. */ - rm->rm_col[VDEV_RAIDZ_P].rc_abd = pdata; - rm->rm_col[VDEV_RAIDZ_Q].rc_abd = qdata; + rr->rr_col[VDEV_RAIDZ_P].rc_abd = pdata; + rr->rr_col[VDEV_RAIDZ_Q].rc_abd = qdata; return ((1 << VDEV_RAIDZ_P) | (1 << VDEV_RAIDZ_Q)); } @@ -1249,13 +1449,13 @@ /* END CSTYLED */ static void -vdev_raidz_matrix_init(raidz_map_t *rm, int n, int nmap, int *map, +vdev_raidz_matrix_init(raidz_row_t *rr, int n, int nmap, int *map, uint8_t **rows) { int i, j; int pow; - ASSERT(n == rm->rm_cols - rm->rm_firstdatacol); + ASSERT(n == rr->rr_cols - rr->rr_firstdatacol); /* * Fill in the missing rows of interest. @@ -1279,7 +1479,7 @@ } static void -vdev_raidz_matrix_invert(raidz_map_t *rm, int n, int nmissing, int *missing, +vdev_raidz_matrix_invert(raidz_row_t *rr, int n, int nmissing, int *missing, uint8_t **rows, uint8_t **invrows, const uint8_t *used) { int i, j, ii, jj; @@ -1291,10 +1491,10 @@ * correspond to data columns. 
*/ for (i = 0; i < nmissing; i++) { - ASSERT3S(used[i], <, rm->rm_firstdatacol); + ASSERT3S(used[i], <, rr->rr_firstdatacol); } for (; i < n; i++) { - ASSERT3S(used[i], >=, rm->rm_firstdatacol); + ASSERT3S(used[i], >=, rr->rr_firstdatacol); } /* @@ -1311,8 +1511,8 @@ */ for (i = 0; i < nmissing; i++) { for (j = nmissing; j < n; j++) { - ASSERT3U(used[j], >=, rm->rm_firstdatacol); - jj = used[j] - rm->rm_firstdatacol; + ASSERT3U(used[j], >=, rr->rr_firstdatacol); + jj = used[j] - rr->rr_firstdatacol; ASSERT3S(jj, <, n); invrows[i][j] = rows[i][jj]; rows[i][jj] = 0; @@ -1373,7 +1573,7 @@ } static void -vdev_raidz_matrix_reconstruct(raidz_map_t *rm, int n, int nmissing, +vdev_raidz_matrix_reconstruct(raidz_row_t *rr, int n, int nmissing, int *missing, uint8_t **invrows, const uint8_t *used) { int i, j, x, cc, c; @@ -1405,22 +1605,24 @@ for (i = 0; i < n; i++) { c = used[i]; - ASSERT3U(c, <, rm->rm_cols); + ASSERT3U(c, <, rr->rr_cols); - src = abd_to_buf(rm->rm_col[c].rc_abd); - ccount = rm->rm_col[c].rc_size; + ccount = rr->rr_col[c].rc_size; + ASSERT(ccount >= rr->rr_col[missing[0]].rc_size || i > 0); + if (ccount == 0) + continue; + src = abd_to_buf(rr->rr_col[c].rc_abd); for (j = 0; j < nmissing; j++) { - cc = missing[j] + rm->rm_firstdatacol; - ASSERT3U(cc, >=, rm->rm_firstdatacol); - ASSERT3U(cc, <, rm->rm_cols); + cc = missing[j] + rr->rr_firstdatacol; + ASSERT3U(cc, >=, rr->rr_firstdatacol); + ASSERT3U(cc, <, rr->rr_cols); ASSERT3U(cc, !=, c); - dst[j] = abd_to_buf(rm->rm_col[cc].rc_abd); - dcount[j] = rm->rm_col[cc].rc_size; + dcount[j] = rr->rr_col[cc].rc_size; + if (dcount[j] != 0) + dst[j] = abd_to_buf(rr->rr_col[cc].rc_abd); } - ASSERT(ccount >= rm->rm_col[missing[0]].rc_size || i > 0); - for (x = 0; x < ccount; x++, src++) { if (*src != 0) log = vdev_raidz_log2[*src]; @@ -1449,13 +1651,15 @@ } static int -vdev_raidz_reconstruct_general(raidz_map_t *rm, int *tgts, int ntgts) +vdev_raidz_reconstruct_general(raidz_row_t *rr, int *tgts, int ntgts) { int n, i, c, t, tt; int nmissing_rows; int missing_rows[VDEV_RAIDZ_MAXPARITY]; int parity_map[VDEV_RAIDZ_MAXPARITY]; + zfs_dbgmsg("reconstruct_general(rm=%p ntgts=%u)", + rr, ntgts); uint8_t *p, *pp; size_t psize; @@ -1471,28 +1675,31 @@ * Matrix reconstruction can't use scatter ABDs yet, so we allocate * temporary linear ABDs. */ - if (!abd_is_linear(rm->rm_col[rm->rm_firstdatacol].rc_abd)) { - bufs = kmem_alloc(rm->rm_cols * sizeof (abd_t *), KM_PUSHPAGE); + if (!abd_is_linear(rr->rr_col[rr->rr_firstdatacol].rc_abd)) { + bufs = kmem_alloc(rr->rr_cols * sizeof (abd_t *), KM_PUSHPAGE); - for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) { - raidz_col_t *col = &rm->rm_col[c]; + for (c = rr->rr_firstdatacol; c < rr->rr_cols; c++) { + raidz_col_t *col = &rr->rr_col[c]; bufs[c] = col->rc_abd; - col->rc_abd = abd_alloc_linear(col->rc_size, B_TRUE); - abd_copy(col->rc_abd, bufs[c], col->rc_size); + if (bufs[c] != NULL) { + col->rc_abd = + abd_alloc_linear(col->rc_size, B_TRUE); + abd_copy(col->rc_abd, bufs[c], col->rc_size); + } } } - n = rm->rm_cols - rm->rm_firstdatacol; + n = rr->rr_cols - rr->rr_firstdatacol; /* * Figure out which data columns are missing. 
*/ nmissing_rows = 0; for (t = 0; t < ntgts; t++) { - if (tgts[t] >= rm->rm_firstdatacol) { + if (tgts[t] >= rr->rr_firstdatacol) { missing_rows[nmissing_rows++] = - tgts[t] - rm->rm_firstdatacol; + tgts[t] - rr->rr_firstdatacol; } } @@ -1502,7 +1709,7 @@ */ for (tt = 0, c = 0, i = 0; i < nmissing_rows; c++) { ASSERT(tt < ntgts); - ASSERT(c < rm->rm_firstdatacol); + ASSERT(c < rr->rr_firstdatacol); /* * Skip any targeted parity columns. @@ -1537,9 +1744,9 @@ used[i] = parity_map[i]; } - for (tt = 0, c = rm->rm_firstdatacol; c < rm->rm_cols; c++) { + for (tt = 0, c = rr->rr_firstdatacol; c < rr->rr_cols; c++) { if (tt < nmissing_rows && - c == missing_rows[tt] + rm->rm_firstdatacol) { + c == missing_rows[tt] + rr->rr_firstdatacol) { tt++; continue; } @@ -1552,18 +1759,18 @@ /* * Initialize the interesting rows of the matrix. */ - vdev_raidz_matrix_init(rm, n, nmissing_rows, parity_map, rows); + vdev_raidz_matrix_init(rr, n, nmissing_rows, parity_map, rows); /* * Invert the matrix. */ - vdev_raidz_matrix_invert(rm, n, nmissing_rows, missing_rows, rows, + vdev_raidz_matrix_invert(rr, n, nmissing_rows, missing_rows, rows, invrows, used); /* * Reconstruct the missing data using the generated matrix. */ - vdev_raidz_matrix_reconstruct(rm, n, nmissing_rows, missing_rows, + vdev_raidz_matrix_reconstruct(rr, n, nmissing_rows, missing_rows, invrows, used); kmem_free(p, psize); @@ -1572,21 +1779,23 @@ * copy back from temporary linear abds and free them */ if (bufs) { - for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) { - raidz_col_t *col = &rm->rm_col[c]; + for (c = rr->rr_firstdatacol; c < rr->rr_cols; c++) { + raidz_col_t *col = &rr->rr_col[c]; - abd_copy(bufs[c], col->rc_abd, col->rc_size); - abd_free(col->rc_abd); + if (bufs[c] != NULL) { + abd_copy(bufs[c], col->rc_abd, col->rc_size); + abd_free(col->rc_abd); + } col->rc_abd = bufs[c]; } - kmem_free(bufs, rm->rm_cols * sizeof (abd_t *)); + kmem_free(bufs, rr->rr_cols * sizeof (abd_t *)); } return (code); } static int -vdev_raidz_reconstruct(raidz_map_t *rm, int *t, int nt) +vdev_raidz_reconstruct_row(raidz_row_t *rr, int *t, int nt) { int tgts[VDEV_RAIDZ_MAXPARITY], *dt; int ntgts; @@ -1595,26 +1804,37 @@ int nbadparity, nbaddata; int parity_valid[VDEV_RAIDZ_MAXPARITY]; + zfs_dbgmsg("reconstruct(rm=%p nt=%u cols=%u md=%u mp=%u)", + rr, nt, (int)rr->rr_cols, (int)rr->rr_missingdata, (int)rr->rr_missingparity); + /* * The tgts list must already be sorted. 
*/ + zfs_dbgmsg("reconstruct(rm=%p t[%u]=%u)", rr, 0, t[0]); for (i = 1; i < nt; i++) { + zfs_dbgmsg("reconstruct(rm=%p t[%u]=%u)", + rr, i, t[i]); ASSERT(t[i] > t[i - 1]); } - nbadparity = rm->rm_firstdatacol; - nbaddata = rm->rm_cols - nbadparity; + nbadparity = rr->rr_firstdatacol; + nbaddata = rr->rr_cols - nbadparity; ntgts = 0; - for (i = 0, c = 0; c < rm->rm_cols; c++) { - if (c < rm->rm_firstdatacol) + for (i = 0, c = 0; c < rr->rr_cols; c++) { + zfs_dbgmsg("reconstruct(rm=%p col=%u devid=%u offset=%llx error=%u)", + rr, c, + (int)rr->rr_col[c].rc_devidx, + (long long)rr->rr_col[c].rc_offset, + (int)rr->rr_col[c].rc_error); + if (c < rr->rr_firstdatacol) parity_valid[c] = B_FALSE; if (i < nt && c == t[i]) { tgts[ntgts++] = c; i++; - } else if (rm->rm_col[c].rc_error != 0) { + } else if (rr->rr_col[c].rc_error != 0) { tgts[ntgts++] = c; - } else if (c >= rm->rm_firstdatacol) { + } else if (c >= rr->rr_firstdatacol) { nbaddata--; } else { parity_valid[c] = B_TRUE; @@ -1635,30 +1855,30 @@ switch (nbaddata) { case 1: if (parity_valid[VDEV_RAIDZ_P]) - return (vdev_raidz_reconstruct_p(rm, dt, 1)); + return (vdev_raidz_reconstruct_p(rr, dt, 1)); - ASSERT(rm->rm_firstdatacol > 1); + ASSERT(rr->rr_firstdatacol > 1); if (parity_valid[VDEV_RAIDZ_Q]) - return (vdev_raidz_reconstruct_q(rm, dt, 1)); + return (vdev_raidz_reconstruct_q(rr, dt, 1)); - ASSERT(rm->rm_firstdatacol > 2); + ASSERT(rr->rr_firstdatacol > 2); break; case 2: - ASSERT(rm->rm_firstdatacol > 1); + ASSERT(rr->rr_firstdatacol > 1); if (parity_valid[VDEV_RAIDZ_P] && parity_valid[VDEV_RAIDZ_Q]) - return (vdev_raidz_reconstruct_pq(rm, dt, 2)); + return (vdev_raidz_reconstruct_pq(rr, dt, 2)); - ASSERT(rm->rm_firstdatacol > 2); + ASSERT(rr->rr_firstdatacol > 2); break; } } - code = vdev_raidz_reconstruct_general(rm, tgts, ntgts); + code = vdev_raidz_reconstruct_general(rr, tgts, ntgts); ASSERT(code < (1 << VDEV_RAIDZ_MAXPARITY)); ASSERT(code > 0); return (code); @@ -1668,8 +1888,8 @@ vdev_raidz_open(vdev_t *vd, uint64_t *asize, uint64_t *max_asize, uint64_t *logical_ashift, uint64_t *physical_ashift) { - vdev_t *cvd; - uint64_t nparity = vd->vdev_nparity; + vdev_raidz_t *vdrz = vd->vdev_tsd; + uint64_t nparity = vdrz->vd_nparity; int c; int lasterror = 0; int numerrors = 0; @@ -1685,7 +1905,7 @@ vdev_open_children(vd); for (c = 0; c < vd->vdev_children; c++) { - cvd = vd->vdev_child[c]; + vdev_t *cvd = vd->vdev_child[c]; if (cvd->vdev_open_error != 0) { lasterror = cvd->vdev_open_error; @@ -1786,9 +2006,10 @@ vdev_raidz_physio(vdev_t *vd, caddr_t data, size_t size, uint64_t offset, uint64_t origoffset, boolean_t doread, boolean_t isdump) { + vdev_raidz_t *vdrz = vd->vdev_tsd; vdev_t *tvd = vd->vdev_top; vdev_t *cvd; - raidz_map_t *rm; + raidz_row_t *rr; raidz_col_t *rc; int c, err = 0; @@ -1818,15 +2039,19 @@ */ abd_t *abd = abd_get_from_buf(data - (offset - origoffset), SPA_OLD_MAXBLOCKSIZE); - rm = vdev_raidz_map_alloc(abd, + /* + * XXX deal with dump to expanded raidz + */ + raidz_map_t *rm = vdev_raidz_map_alloc(abd, SPA_OLD_MAXBLOCKSIZE, origoffset, B_FALSE, tvd->vdev_ashift, - vd->vdev_children, vd->vdev_nparity); + vd->vdev_children, vdrz->vd_nparity); + rr = rm->rm_row[0]; coloffset = origoffset; - for (c = rm->rm_firstdatacol; c < rm->rm_cols; + for (c = rr->rr_firstdatacol; c < rr->rr_cols; c++, coloffset += rc->rc_size) { - rc = &rm->rm_col[c]; + rc = &rr->rr_col[c]; cvd = vd->vdev_child[rc->rc_devidx]; /* @@ -1863,7 +2088,7 @@ break; } - vdev_raidz_map_free(rm); + vdev_raidz_row_free(rr); abd_put(abd); #endif /* KERNEL */ 
@@ -1874,10 +2099,11 @@ static uint64_t vdev_raidz_asize(vdev_t *vd, uint64_t psize) { + vdev_raidz_t *vdrz = vd->vdev_tsd; uint64_t asize; uint64_t ashift = vd->vdev_top->vdev_ashift; - uint64_t cols = vd->vdev_children; - uint64_t nparity = vd->vdev_nparity; + uint64_t cols = vdrz->vd_logical_width; + uint64_t nparity = vdrz->vd_nparity; asize = ((psize - 1) >> ashift) + 1; asize += nparity * ((asize + cols - nparity - 1) / (cols - nparity)); @@ -1896,119 +2122,137 @@ rc->rc_skipped = 0; } -/* - * Start an IO operation on a RAIDZ VDev - * - * Outline: - * - For write operations: - * 1. Generate the parity data - * 2. Create child zio write operations to each column's vdev, for both - * data and parity. - * 3. If the column skips any sectors for padding, create optional dummy - * write zio children for those areas to improve aggregation continuity. - * - For read operations: - * 1. Create child zio read operations to each data column's vdev to read - * the range of data required for zio. - * 2. If this is a scrub or resilver operation, or if any of the data - * vdevs have had errors, then create zio read operations to the parity - * columns' VDevs as well. - */ static void -vdev_raidz_io_start(zio_t *zio) +vdev_raidz_io_verify(zio_t *zio, raidz_row_t *rr, int col) { +#if 0 +#ifdef ZFS_DEBUG vdev_t *vd = zio->io_vd; vdev_t *tvd = vd->vdev_top; - vdev_t *cvd; - raidz_map_t *rm; - raidz_col_t *rc; - int c, i; - rm = vdev_raidz_map_alloc(zio->io_abd, zio->io_size, zio->io_offset, - zio->io_type == ZIO_TYPE_FREE, - tvd->vdev_ashift, vd->vdev_children, - vd->vdev_nparity); + range_seg_t logical_rs, physical_rs; + logical_rs.rs_start = zio->io_offset; + logical_rs.rs_end = logical_rs.rs_start + + vdev_raidz_asize(zio->io_vd, zio->io_size); - zio->io_vsd = rm; - zio->io_vsd_ops = &vdev_raidz_vsd_ops; + raidz_col_t *rc = &rr->rr_col[col]; + vdev_t *cvd = vd->vdev_child[rc->rc_devidx]; - ASSERT3U(rm->rm_asize, ==, vdev_psize_to_asize(vd, zio->io_size)); + vdev_xlate(cvd, &logical_rs, &physical_rs); + ASSERT3U(rc->rc_offset, ==, physical_rs.rs_start); + ASSERT3U(rc->rc_offset, <, physical_rs.rs_end); + /* + * It would be nice to assert that rs_end is equal + * to rc_offset + rc_size but there might be an + * optional I/O at the end that is not accounted in + * rc_size. 
+ */ + if (physical_rs.rs_end > rc->rc_offset + rc->rc_size) { + ASSERT3U(physical_rs.rs_end, ==, rc->rc_offset + + rc->rc_size + (1 << tvd->vdev_ashift)); + } else { + ASSERT3U(physical_rs.rs_end, ==, rc->rc_offset + rc->rc_size); + } +#endif +#endif +} - if (zio->io_type == ZIO_TYPE_FREE) { - for (c = 0; c < rm->rm_cols; c++) { - rc = &rm->rm_col[c]; - cvd = vd->vdev_child[rc->rc_devidx]; - zio_nowait(zio_vdev_child_io(zio, NULL, cvd, - rc->rc_offset, rc->rc_abd, rc->rc_size, - zio->io_type, zio->io_priority, 0, - vdev_raidz_child_done, rc)); - } +static void +vdev_raidz_io_start_free(zio_t *zio, raidz_row_t *rr) +{ + vdev_t *vd = zio->io_vd; + vdev_t *tvd = vd->vdev_top; - zio_execute(zio); - return; + for (int c = 0; c < rr->rr_cols; c++) { + raidz_col_t *rc = &rr->rr_col[c]; + if (rc->rc_size == 0) + continue; + vdev_t *cvd = vd->vdev_child[rc->rc_devidx]; + + zio_nowait(zio_vdev_child_io(zio, NULL, cvd, + rc->rc_offset, rc->rc_abd, rc->rc_size, + zio->io_type, zio->io_priority, 0, + vdev_raidz_child_done, rc)); } +} - if (zio->io_type == ZIO_TYPE_WRITE) { - vdev_raidz_generate_parity(rm); +static void +vdev_raidz_io_start_write(zio_t *zio, raidz_row_t *rr) +{ + vdev_t *vd = zio->io_vd; + vdev_t *tvd = vd->vdev_top; - for (c = 0; c < rm->rm_cols; c++) { - rc = &rm->rm_col[c]; - cvd = vd->vdev_child[rc->rc_devidx]; - zio_nowait(zio_vdev_child_io(zio, NULL, cvd, - rc->rc_offset, rc->rc_abd, rc->rc_size, - zio->io_type, zio->io_priority, 0, - vdev_raidz_child_done, rc)); - } + vdev_raidz_generate_parity(rr); - /* - * Generate optional I/Os for any skipped sectors to improve - * aggregation contiguity. - */ - for (c = rm->rm_skipstart, i = 0; i < rm->rm_nskip; c++, i++) { - ASSERT(c <= rm->rm_scols); - if (c == rm->rm_scols) - c = 0; - rc = &rm->rm_col[c]; - cvd = vd->vdev_child[rc->rc_devidx]; - zio_nowait(zio_vdev_child_io(zio, NULL, cvd, - rc->rc_offset + rc->rc_size, NULL, - 1 << tvd->vdev_ashift, - zio->io_type, zio->io_priority, - ZIO_FLAG_NODATA | ZIO_FLAG_OPTIONAL, NULL, NULL)); - } + for (int c = 0; c < rr->rr_cols; c++) { + raidz_col_t *rc = &rr->rr_col[c]; + if (rc->rc_size == 0) + continue; + vdev_t *cvd = vd->vdev_child[rc->rc_devidx]; - zio_execute(zio); - return; + zio_nowait(zio_vdev_child_io(zio, NULL, cvd, + rc->rc_offset, rc->rc_abd, rc->rc_size, + zio->io_type, zio->io_priority, 0, + vdev_raidz_child_done, rc)); + } + + /* XXX do this in vdev_raidz_io_start, based on nskip stored in rm + */ +#if 0 + /* + * Generate optional I/Os for any skipped sectors to improve + * aggregation contiguity. + */ + for (int c = rr->rm_skipstart, i = 0; i < rr->rm_nskip; c++, i++) { + ASSERT(c <= rr->rm_scols); + if (c == rr->rm_scols) + c = 0; + raidz_col_t *rc = &rr->rr_col[c]; + vdev_t *cvd = vd->vdev_child[rc->rc_devidx]; + zio_nowait(zio_vdev_child_io(zio, NULL, cvd, + rc->rc_offset + rc->rc_size, NULL, + 1 << tvd->vdev_ashift, + zio->io_type, zio->io_priority, + ZIO_FLAG_NODATA | ZIO_FLAG_OPTIONAL, NULL, NULL)); } +#endif +} - ASSERT(zio->io_type == ZIO_TYPE_READ); +static void +vdev_raidz_io_start_read(zio_t *zio, raidz_row_t *rr, boolean_t forceparity) +{ + vdev_t *vd = zio->io_vd; /* * Iterate over the columns in reverse order so that we hit the parity * last -- any errors along the way will force us to read the parity. 
*/ - for (c = rm->rm_cols - 1; c >= 0; c--) { - rc = &rm->rm_col[c]; - cvd = vd->vdev_child[rc->rc_devidx]; + for (int c = rr->rr_cols - 1; c >= 0; c--) { + raidz_col_t *rc = &rr->rr_col[c]; + if (rc->rc_size == 0) + continue; + vdev_t *cvd = vd->vdev_child[rc->rc_devidx]; if (!vdev_readable(cvd)) { - if (c >= rm->rm_firstdatacol) - rm->rm_missingdata++; + if (c >= rr->rr_firstdatacol) + rr->rr_missingdata++; else - rm->rm_missingparity++; + rr->rr_missingparity++; rc->rc_error = SET_ERROR(ENXIO); rc->rc_tried = 1; /* don't even try */ rc->rc_skipped = 1; continue; } if (vdev_dtl_contains(cvd, DTL_MISSING, zio->io_txg, 1)) { - if (c >= rm->rm_firstdatacol) - rm->rm_missingdata++; + if (c >= rr->rr_firstdatacol) + rr->rr_missingdata++; else - rm->rm_missingparity++; + rr->rr_missingparity++; rc->rc_error = SET_ERROR(ESTALE); rc->rc_skipped = 1; continue; } - if (c >= rm->rm_firstdatacol || rm->rm_missingdata > 0 || + if (forceparity || + c >= rr->rr_firstdatacol || rr->rr_missingdata > 0 || (zio->io_flags & (ZIO_FLAG_SCRUB | ZIO_FLAG_RESILVER))) { zio_nowait(zio_vdev_child_io(zio, NULL, cvd, rc->rc_offset, rc->rc_abd, rc->rc_size, @@ -2016,6 +2260,75 @@ vdev_raidz_child_done, rc)); } } +} + +/* + * Start an IO operation on a RAIDZ VDev + * + * Outline: + * - For write operations: + * 1. Generate the parity data + * 2. Create child zio write operations to each column's vdev, for both + * data and parity. + * 3. If the column skips any sectors for padding, create optional dummy + * write zio children for those areas to improve aggregation continuity. + * - For read operations: + * 1. Create child zio read operations to each data column's vdev to read + * the range of data required for zio. + * 2. If this is a scrub or resilver operation, or if any of the data + * vdevs have had errors, then create zio read operations to the parity + * columns' VDevs as well. + */ +static void +vdev_raidz_io_start(zio_t *zio) +{ + vdev_t *vd = zio->io_vd; + vdev_t *tvd = vd->vdev_top; + vdev_raidz_t *vdrz = vd->vdev_tsd; + raidz_map_t *rm; + + ASSERT(!vdrz->vn_expanding); + + if (vdrz->vd_logical_width != vdrz->vd_physical_width) { + rm = vdev_raidz_map_alloc_expanded(zio->io_abd, + zio->io_size, zio->io_offset, + zio->io_type == ZIO_TYPE_FREE, + tvd->vdev_ashift, vdrz->vd_physical_width, + vdrz->vd_logical_width, vdrz->vd_nparity); + } else { + rm = vdev_raidz_map_alloc(zio->io_abd, + zio->io_size, zio->io_offset, + zio->io_type == ZIO_TYPE_FREE, + tvd->vdev_ashift, vdrz->vd_logical_width, + vdrz->vd_nparity); + } + + zio->io_vsd = rm; + zio->io_vsd_ops = &vdev_raidz_vsd_ops; + + if (zio->io_type == ZIO_TYPE_FREE) { + for (int i = 0; i < rm->rm_nrows; i++) { + vdev_raidz_io_start_free(zio, rm->rm_row[i]); + } + } else if (zio->io_type == ZIO_TYPE_WRITE) { + for (int i = 0; i < rm->rm_nrows; i++) { + vdev_raidz_io_start_write(zio, + rm->rm_row[i]); + } + } else { + ASSERT(zio->io_type == ZIO_TYPE_READ); + /* + * If there are multiple rows, we will be hitting + * all disks, so go ahead and read the parity so + * that we are reading in decent size chunks. + * XXX maybe doesn't really matter? + */ + boolean_t forceparity = rm->rm_nrows > 1; + for (int i = 0; i < rm->rm_nrows; i++) { + vdev_raidz_io_start_read(zio, + rm->rm_row[i], forceparity); + } + } zio_execute(zio); } @@ -2070,10 +2383,10 @@ * Generate the parity from the data columns. If we tried and were able to * read the parity without error, verify that the generated parity matches the * data we read. If it doesn't, we fire off a checksum error. 
Return the - * number such failures. + * number of such failures. */ static int -raidz_parity_verify(zio_t *zio, raidz_map_t *rm) +raidz_parity_verify(zio_t *zio, raidz_row_t *rr) { void *orig[VDEV_RAIDZ_MAXPARITY]; int c, ret = 0; @@ -2086,21 +2399,29 @@ if (checksum == ZIO_CHECKSUM_NOPARITY) return (ret); - for (c = 0; c < rm->rm_firstdatacol; c++) { - rc = &rm->rm_col[c]; + for (c = 0; c < rr->rr_firstdatacol; c++) { + rc = &rr->rr_col[c]; if (!rc->rc_tried || rc->rc_error != 0) continue; orig[c] = zio_buf_alloc(rc->rc_size); abd_copy_to_buf(orig[c], rc->rc_abd, rc->rc_size); } - vdev_raidz_generate_parity(rm); + /* XXX regenerates parity even for !tried||rc_error!=0 + * This could cause a side effect of fixing stuff we didn't realize + * was necessary (i.e. even if we return 0) + */ + vdev_raidz_generate_parity(rr); + + for (c = 0; c < rr->rr_firstdatacol; c++) { + rc = &rr->rr_col[c]; - for (c = 0; c < rm->rm_firstdatacol; c++) { - rc = &rm->rm_col[c]; if (!rc->rc_tried || rc->rc_error != 0) continue; + if (abd_cmp_buf(rc->rc_abd, orig[c], rc->rc_size) != 0) { + zfs_dbgmsg("raidz_parity_verify found error on col=%u devidx=%u", + c, (int)rc->rc_devidx); raidz_checksum_error(zio, rc, orig[c]); rc->rc_error = SET_ERROR(ECKSUM); ret++; @@ -2117,16 +2438,83 @@ static uint64_t raidz_corrected[1 << VDEV_RAIDZ_MAXPARITY]; static int -vdev_raidz_worst_error(raidz_map_t *rm) +vdev_raidz_worst_error(raidz_row_t *rr) { int error = 0; - for (int c = 0; c < rm->rm_cols; c++) - error = zio_worst_error(error, rm->rm_col[c].rc_error); + for (int c = 0; c < rr->rr_cols; c++) + error = zio_worst_error(error, rr->rr_col[c].rc_error); return (error); } +static void +vdev_raidz_io_done_verified(zio_t *zio, raidz_row_t *rr) +{ + int unexpected_errors = 0; + int parity_errors = 0; + int parity_untried = 0; + int data_errors = 0; + + ASSERT3U(zio->io_type, ==, ZIO_TYPE_READ); + + for (int c = 0; c < rr->rr_cols; c++) { + raidz_col_t *rc = &rr->rr_col[c]; + + if (rc->rc_error) { + if (c < rr->rr_firstdatacol) + parity_errors++; + else + data_errors++; + + if (!rc->rc_skipped) + unexpected_errors++; + } else if (c < rr->rr_firstdatacol && !rc->rc_tried) { + parity_untried++; + } + } + + /* + * If we read more parity disks than were used for + * reconstruction, confirm that the other parity disks produced + * correct data. + * + * Note that we also regenerate parity when resilvering so we + * can write it out to failed devices later. + */ + zfs_dbgmsg("parity_errors=%u parity_untried=%u data_errors=%u verifying=%s", + parity_errors, parity_untried, data_errors, + (parity_errors + parity_untried < rr->rr_firstdatacol - data_errors) ? "yes" : "no"); + if (parity_errors + parity_untried < + rr->rr_firstdatacol - data_errors || + (zio->io_flags & ZIO_FLAG_RESILVER)) { + int n = raidz_parity_verify(zio, rr); + unexpected_errors += n; + ASSERT3U(parity_errors + n, <=, rr->rr_firstdatacol); + } + + if (zio->io_error == 0 && spa_writeable(zio->io_spa) && + (unexpected_errors > 0 || (zio->io_flags & ZIO_FLAG_RESILVER))) { + /* + * Use the good data we have in hand to repair damaged children. + */ + for (int c = 0; c < rr->rr_cols; c++) { + raidz_col_t *rc = &rr->rr_col[c]; + vdev_t *vd = zio->io_vd; + vdev_t *cvd = vd->vdev_child[rc->rc_devidx]; + + if (rc->rc_error == 0 || rc->rc_size == 0) + continue; + + zio_nowait(zio_vdev_child_io(zio, NULL, cvd, + rc->rc_offset, rc->rc_abd, rc->rc_size, + ZIO_TYPE_WRITE, ZIO_PRIORITY_ASYNC_WRITE, + ZIO_FLAG_IO_REPAIR | (unexpected_errors ? 
+ ZIO_FLAG_SELF_HEAL : 0), NULL, NULL)); + } + } +} + /* * Iterate over all combinations of bad data and attempt a reconstruction. * Note that the algorithm below is non-optimal because it doesn't take into @@ -2134,454 +2522,771 @@ * triple-parity RAID-Z the reconstruction procedure is the same if column 4 * is targeted as invalid as if columns 1 and 4 are targeted since in both * cases we'd only use parity information in column 0. + * + * The order that we find the various possible combinations of failed + * disks is dictated by these rules: + * - Examine each "slot" (the "i" in tgts[i]) + * - Try to increment this slot (tgts[i] = tgts[i] + 1) + * - if we can't increment because it runs into the next slot, + * reset our slot to the minimum, and examine the next slot + * For example, with a 6-wide RAIDZ3, and no known errors (so we have to choose + * 3 columns to reconstruct), we will generate the following sequence: + * + * STATE ACTION + * 0 1 2 special case: skip since these are all parity + * 0 1 3 first slot: reset to 0; middle slot: increment to 2 + * 0 2 3 first slot: increment to 1 + * 1 2 3 first: reset to 0; middle: reset to 1; last: increment to 4 + * 0 1 4 first: reset to 0; middle: increment to 2 + * 0 2 4 first: increment to 1 + * 1 2 4 first: reset to 0; middle: increment to 3 + * 0 3 4 first: increment to 1 + * 1 3 4 first: increment to 2 + * 2 3 4 first: reset to 0; middle: reset to 1; last: increment to 5 + * 0 1 5 first: reset to 0; middle: increment to 2 + * 0 2 5 first: increment to 1 + * 1 2 5 first: reset to 0; middle: increment to 3 + * 0 3 5 first: increment to 1 + * 1 3 5 first: increment to 2 + * 2 3 5 first: reset to 0; middle: increment to 4 + * 0 4 5 first: increment to 1 + * 1 4 5 first: increment to 2 + * 2 4 5 first: increment to 3 + * 3 4 5 done */ -static int -vdev_raidz_combrec(zio_t *zio, int total_errors, int data_errors) -{ - raidz_map_t *rm = zio->io_vsd; - raidz_col_t *rc; - void *orig[VDEV_RAIDZ_MAXPARITY]; - int tstore[VDEV_RAIDZ_MAXPARITY + 2]; - int *tgts = &tstore[1]; - int current, next, i, c, n; - int code, ret = 0; - ASSERT(total_errors < rm->rm_firstdatacol); +/* + * Should this sector be considered failed for logical child ID i? + * XXX comment explaining logical child ID's + */ +static boolean_t +raidz_simulate_failure(vdev_raidz_t *vdrz, int ashift, int i, raidz_col_t *rc) +{ + uint64_t sector_id = + vdrz->vd_physical_width * (rc->rc_offset >> ashift) + + rc->rc_devidx; + +#if 0 + zfs_dbgmsg("raidz_simulate_failure(pw=%u lw=%u ashift=%u i=%u rc_offset=%llx rc_devidx=%u sector_id=%u", + vdrz->vd_physical_width, + vdrz->vd_logical_width, + ashift, + i, + (long long)rc->rc_offset, + (int)rc->rc_devidx, + (long long)sector_id); +#endif - /* - * This simplifies one edge condition. - */ - tgts[-1] = -1; + for (int w = vdrz->vd_physical_width; + w >= vdrz->vd_logical_width; w--) { + if (i < w) { + return (sector_id % w == i); + } else { + i -= w; + } + } + ASSERT(!"invalid logical child id"); + return (B_FALSE); +} - for (n = 1; n <= rm->rm_firstdatacol - total_errors; n++) { - /* - * Initialize the targets array by finding the first n columns - * that contain no error. - * - * If there were no data errors, we need to ensure that we're - * always explicitly attempting to reconstruct at least one - * data column. To do this, we simply push the highest target - * up into the data columns. 
- */ - for (c = 0, i = 0; i < n; i++) { - if (i == n - 1 && data_errors == 0 && - c < rm->rm_firstdatacol) { - c = rm->rm_firstdatacol; +static void +raidz_restore_orig_data(raidz_map_t *rm) +{ + for (int i = 0; i < rm->rm_nrows; i++) { + raidz_row_t *rr = rm->rm_row[i]; + for (int c = 0; c < rr->rr_cols; c++) { + raidz_col_t *rc = &rr->rr_col[c]; + if (rc->rc_need_orig_restore) { + abd_copy_from_buf(rc->rc_abd, + rc->rc_orig_data, rc->rc_size); + rc->rc_need_orig_restore = B_FALSE; } + } + } +} - while (rm->rm_col[c].rc_error != 0) { - c++; - ASSERT3S(c, <, rm->rm_cols); +/* + * returns EINVAL if reconstruction of the block will not be possible + * returns ECKSUM if this specific reconstruction failed + * returns 0 on successful reconstruction + */ +static int +raidz_reconstruct(zio_t *zio, int *ltgts, int ntgts) +{ + raidz_map_t *rm = zio->io_vsd; + vdev_raidz_t *vdrz = zio->io_vd->vdev_tsd; + + zfs_dbgmsg("raidz_reconstruct_expanded(zio=%p ltgts=%u,%u,%u ntgts=%u", + zio, ltgts[0], ltgts[1], ltgts[2], ntgts); + + /* Reconstruct each row */ + for (int r = 0; r < rm->rm_nrows; r++) { + raidz_row_t *rr = rm->rm_row[r]; + int my_tgts[VDEV_RAIDZ_MAXPARITY]; /* value is child id */ + int t = 0; + int dead = 0; + int dead_data = 0; + + zfs_dbgmsg("raidz_reconstruct_expanded(row=%u)", + r); + + for (int c = 0; c < rr->rr_cols; c++) { + raidz_col_t *rc = &rr->rr_col[c]; + ASSERT0(rc->rc_need_orig_restore); + if (rc->rc_error != 0) { + dead++; + if (c >= vdrz->vd_nparity) + dead_data++; + continue; + } + if (rc->rc_size == 0) + continue; + for (int lt = 0; lt < ntgts; lt++) { + if (raidz_simulate_failure(vdrz, + zio->io_vd->vdev_top->vdev_ashift, + ltgts[lt], rc)) { + if (rc->rc_orig_data == NULL) { + rc->rc_orig_data = + zio_buf_alloc(rc->rc_size); + abd_copy_to_buf(rc->rc_orig_data, + rc->rc_abd, rc->rc_size); + } + rc->rc_need_orig_restore = B_TRUE; + + dead++; + if (c >= vdrz->vd_nparity) + dead_data++; + my_tgts[t++] = c; + zfs_dbgmsg("simulating failure of col %u devidx %u", + c, (int)rc->rc_devidx); + break; + } } - - tgts[i] = c++; } - - /* - * Setting tgts[n] simplifies the other edge condition. - */ - tgts[n] = rm->rm_cols; - - /* - * These buffers were allocated in previous iterations. - */ - for (i = 0; i < n - 1; i++) { - ASSERT(orig[i] != NULL); + if (dead > vdrz->vd_nparity) { + /* reconstruction not possible */ + zfs_dbgmsg("reconstruction not possible; too many failures"); + raidz_restore_orig_data(rm); + return (EINVAL); } + rr->rr_code = 0; + if (dead_data > 0) + rr->rr_code = vdev_raidz_reconstruct_row(rr, my_tgts, t); + } - orig[n - 1] = zio_buf_alloc(rm->rm_col[0].rc_size); + /* Check for success */ + if (raidz_checksum_verify(zio) == 0) { + + /* Reconstruction succeeded - report errors */ + for (int i = 0; i < rm->rm_nrows; i++) { + raidz_row_t *rr = rm->rm_row[i]; + atomic_inc_64(&raidz_corrected[rr->rr_code]); + + for (int c = 0; c < rr->rr_cols; c++) { + raidz_col_t *rc = &rr->rr_col[c]; + if (rc->rc_need_orig_restore) { + /* + * Note: if this is a parity column, + * we don't really know if it's wrong. + * We need to let + * vdev_raidz_io_done_verified() check + * it, and if we set rc_error, it will + * think that it is a "known" error + * that doesn't need to be checked + * or corrected. 
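+					 * For that reason, rc_error is only
+					 * set for data columns here.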
+ */ + if (rc->rc_error == 0 && c >= rr->rr_firstdatacol) { + raidz_checksum_error(zio, rc, rc->rc_gdata); + rc->rc_error = SET_ERROR(ECKSUM); + } + rc->rc_need_orig_restore = B_FALSE; + } + } - current = 0; - next = tgts[current]; + vdev_raidz_io_done_verified(zio, rr); + } - while (current != n) { - tgts[current] = next; - current = 0; + zio_checksum_verified(zio); - /* - * Save off the original data that we're going to - * attempt to reconstruct. - */ - for (i = 0; i < n; i++) { - ASSERT(orig[i] != NULL); - c = tgts[i]; - ASSERT3S(c, >=, 0); - ASSERT3S(c, <, rm->rm_cols); - rc = &rm->rm_col[c]; - abd_copy_to_buf(orig[i], rc->rc_abd, - rc->rc_size); - } + zfs_dbgmsg("reconstruction successful (checksum verified)"); + return (0); + } - /* - * Attempt a reconstruction and exit the outer loop on - * success. - */ - code = vdev_raidz_reconstruct(rm, tgts, n); - if (raidz_checksum_verify(zio) == 0) { - atomic_inc_64(&raidz_corrected[code]); - - for (i = 0; i < n; i++) { - c = tgts[i]; - rc = &rm->rm_col[c]; - ASSERT(rc->rc_error == 0); - if (rc->rc_tried) - raidz_checksum_error(zio, rc, - orig[i]); - rc->rc_error = SET_ERROR(ECKSUM); - } + /* Reconstruction failed - restore original data */ + raidz_restore_orig_data(rm); + zfs_dbgmsg("raidz_reconstruct_expanded(zio=%p) checksum failed", + zio); + return (ECKSUM); +} - ret = code; - goto done; - } +/* + * return 0 on success, ECKSUM on failure + */ +static int +vdev_raidz_combrec(zio_t *zio) +{ + raidz_map_t *rm = zio->io_vsd; + vdev_raidz_t *vdrz = zio->io_vd->vdev_tsd; + + for (int num_failures = 1; num_failures <= vdrz->vd_nparity; + num_failures++) { + int tstore[VDEV_RAIDZ_MAXPARITY + 2]; + int *ltgts = &tstore[1]; /* value is logical child ID */ + + /* Determine number of logical children, n */ + int n = 0; + for (int w = vdrz->vd_physical_width; + w >= vdrz->vd_logical_width; w--) { + n += w; + } - /* - * Restore the original data. - */ - for (i = 0; i < n; i++) { - c = tgts[i]; - rc = &rm->rm_col[c]; - abd_copy_from_buf(rc->rc_abd, orig[i], - rc->rc_size); - } + ASSERT3U(num_failures, <=, vdrz->vd_nparity); + ASSERT3U(num_failures, <=, VDEV_RAIDZ_MAXPARITY); + /* handle corner cases in combrec logic */ + ltgts[-1] = -1; + for (int i = 0; i < num_failures; i++) { + ltgts[i] = i; + } + ltgts[num_failures] = n; - do { + for (;;) { + int err = raidz_reconstruct(zio, + ltgts, num_failures); + if (err == EINVAL) { /* - * Find the next valid column after the current - * position.. + * Reconstruction not possible with this # + * failures; try more failures. */ - for (next = tgts[current] + 1; - next < rm->rm_cols && - rm->rm_col[next].rc_error != 0; next++) - continue; + break; + } else if (err == 0) + return (0); + + /* Compute next targets to try */ + for (int t = 0; ; t++) { + ASSERT3U(t, <, num_failures); + ltgts[t]++; + if (ltgts[t] == n) { + ASSERT3U(t, ==, num_failures - 1); + zfs_dbgmsg("reconstruction failed for num_failures=%u; tried all combinations", + num_failures); + break; // try more failures + } - ASSERT(next <= tgts[current + 1]); + ASSERT3U(ltgts[t], <, n); + ASSERT3U(ltgts[t], <=, ltgts[t + 1]); /* * If that spot is available, we're done here. */ - if (next != tgts[current + 1]) - break; + if (ltgts[t] != ltgts[t + 1]) + break; // found next combination /* - * Otherwise, find the next valid column after - * the previous position. + * Otherwise, reset this tgt to the minimum, + * and move on to the next tgt. 
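+				 * (The minimum is one more than the
+				 * previous slot's value, keeping the
+				 * targets strictly increasing.)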
*/ - for (c = tgts[current - 1] + 1; - rm->rm_col[c].rc_error != 0; c++) - continue; - - tgts[current] = c; - current++; - - } while (current != n); + ltgts[t] = ltgts[t - 1] + 1; + ASSERT3U(ltgts[t], ==, t); + } + if (ltgts[num_failures - 1] == n) + break; // try more failures } } - n--; -done: - for (i = 0; i < n; i++) { - zio_buf_free(orig[i], rm->rm_col[0].rc_size); - } - - return (ret); + zfs_dbgmsg("reconstruction failed for all num_failures"); + return (ECKSUM); } /* - * Complete an IO operation on a RAIDZ VDev + * Complete a write IO operation on a RAIDZ VDev * * Outline: - * - For write operations: * 1. Check for errors on the child IOs. * 2. Return, setting an error code if too few child VDevs were written * to reconstruct the data later. Note that partial writes are * considered successful if they can be reconstructed at all. - * - For read operations: - * 1. Check for errors on the child IOs. - * 2. If data errors occurred: - * a. Try to reassemble the data from the parity available. - * b. If we haven't yet read the parity drives, read them now. - * c. If all parity drives have been read but the data still doesn't - * reassemble with a correct checksum, then try combinatorial - * reconstruction. - * d. If that doesn't work, return an error. - * 3. If there were unexpected errors or this is a resilver operation, - * rewrite the vdevs that had errors. */ static void -vdev_raidz_io_done(zio_t *zio) +vdev_raidz_io_done_write_impl(zio_t *zio, raidz_row_t *rr) +{ + int total_errors = 0; + + ASSERT3U(rr->rr_missingparity, <=, rr->rr_firstdatacol); + ASSERT3U(rr->rr_missingdata, <=, rr->rr_cols - rr->rr_firstdatacol); + ASSERT3U(zio->io_type, ==, ZIO_TYPE_WRITE); + + for (int c = 0; c < rr->rr_cols; c++) { + raidz_col_t *rc = &rr->rr_col[c]; + + if (rc->rc_error) { + ASSERT(rc->rc_error != ECKSUM); /* child has no bp */ + + total_errors++; + } + } + + /* + * XXX -- for now, treat partial writes as a success. + * (If we couldn't write enough columns to reconstruct + * the data, the I/O failed. Otherwise, good enough.) + * + * Now that we support write reallocation, it would be better + * to treat partial failure as real failure unless there are + * no non-degraded top-level vdevs left, and not update DTLs + * if we intend to reallocate. + */ + /* XXPOLICY */ + if (total_errors > rr->rr_firstdatacol) { + zio->io_error = zio_worst_error(zio->io_error, + vdev_raidz_worst_error(rr)); + } +} + +/* + * return 0 if no reconstruction occurred, otherwise the "code" from + * vdev_raidz_reconstruct(). 
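+ * (The per-row reconstruction itself is done by
+ * vdev_raidz_reconstruct_row().)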
+ */ +static int +vdev_raidz_io_done_reconstruct_known_missing(zio_t *zio, raidz_row_t *rr) { - vdev_t *vd = zio->io_vd; - vdev_t *cvd; - raidz_map_t *rm = zio->io_vsd; - raidz_col_t *rc; - int unexpected_errors = 0; int parity_errors = 0; int parity_untried = 0; int data_errors = 0; int total_errors = 0; - int n, c; - int tgts[VDEV_RAIDZ_MAXPARITY]; - int code; - - ASSERT(zio->io_bp != NULL); /* XXX need to add code to enforce this */ + int code = 0; - ASSERT(rm->rm_missingparity <= rm->rm_firstdatacol); - ASSERT(rm->rm_missingdata <= rm->rm_cols - rm->rm_firstdatacol); + ASSERT3U(rr->rr_missingparity, <=, rr->rr_firstdatacol); + ASSERT3U(rr->rr_missingdata, <=, rr->rr_cols - rr->rr_firstdatacol); + ASSERT3U(zio->io_type, ==, ZIO_TYPE_READ); - for (c = 0; c < rm->rm_cols; c++) { - rc = &rm->rm_col[c]; + for (int c = 0; c < rr->rr_cols; c++) { + raidz_col_t *rc = &rr->rr_col[c]; if (rc->rc_error) { ASSERT(rc->rc_error != ECKSUM); /* child has no bp */ - if (c < rm->rm_firstdatacol) + if (c < rr->rr_firstdatacol) parity_errors++; else data_errors++; - if (!rc->rc_skipped) - unexpected_errors++; - total_errors++; - } else if (c < rm->rm_firstdatacol && !rc->rc_tried) { + } else if (c < rr->rr_firstdatacol && !rc->rc_tried) { parity_untried++; } } - if (zio->io_type == ZIO_TYPE_WRITE) { + /* + * If there were data errors and the number of errors we saw was + * correctable -- less than or equal to the number of parity disks read + * -- reconstruct based on the missing data. + */ + if (data_errors != 0 && + total_errors <= rr->rr_firstdatacol - parity_untried) { /* - * XXX -- for now, treat partial writes as a success. - * (If we couldn't write enough columns to reconstruct - * the data, the I/O failed. Otherwise, good enough.) - * - * Now that we support write reallocation, it would be better - * to treat partial failure as real failure unless there are - * no non-degraded top-level vdevs left, and not update DTLs - * if we intend to reallocate. + * We either attempt to read all the parity columns or + * none of them. If we didn't try to read parity, we + * wouldn't be here in the correctable case. There must + * also have been fewer parity errors than parity + * columns or, again, we wouldn't be in this code path. */ - /* XXPOLICY */ - if (total_errors > rm->rm_firstdatacol) - zio->io_error = vdev_raidz_worst_error(rm); + ASSERT(parity_untried == 0); + ASSERT(parity_errors < rr->rr_firstdatacol); - return; - } else if (zio->io_type == ZIO_TYPE_FREE) { - return; + /* + * Identify the data columns that reported an error. + */ + int n = 0; + int tgts[VDEV_RAIDZ_MAXPARITY]; + for (int c = rr->rr_firstdatacol; c < rr->rr_cols; c++) { + raidz_col_t *rc = &rr->rr_col[c]; + if (rc->rc_error != 0) { + ASSERT(n < VDEV_RAIDZ_MAXPARITY); + tgts[n++] = c; + } + } + + ASSERT(rr->rr_firstdatacol >= n); + + code = vdev_raidz_reconstruct_row(rr, tgts, n); } - ASSERT(zio->io_type == ZIO_TYPE_READ); - /* - * There are three potential phases for a read: - * 1. produce valid data from the columns read - * 2. read all disks and try again - * 3. perform combinatorial reconstruction - * - * Each phase is progressively both more expensive and less likely to - * occur. If we encounter more errors than we can repair or all phases - * fail, we have no choice but to return an error. - */ + return (code); +} - /* - * If the number of errors we saw was correctable -- less than or equal - * to the number of parity disks read -- attempt to produce data that - * has a valid checksum. 
Naturally, this case applies in the absence of - * any errors. - */ - if (total_errors <= rm->rm_firstdatacol - parity_untried) { - if (data_errors == 0) { - if (raidz_checksum_verify(zio) == 0) { - /* - * If we read parity information (unnecessarily - * as it happens since no reconstruction was - * needed) regenerate and verify the parity. - * We also regenerate parity when resilvering - * so we can write it out to the failed device - * later. - */ - if (parity_errors + parity_untried < - rm->rm_firstdatacol || - (zio->io_flags & ZIO_FLAG_RESILVER)) { - n = raidz_parity_verify(zio, rm); - unexpected_errors += n; - ASSERT(parity_errors + n <= - rm->rm_firstdatacol); - } - goto done; +/* + * return the number of reads issued. + */ +static int +vdev_raidz_read_all(zio_t *zio, raidz_row_t *rr) +{ + vdev_t *vd = zio->io_vd; + int nread = 0; + + rr->rr_missingdata = 0; + rr->rr_missingparity = 0; + + for (int c = 0; c < rr->rr_cols; c++) { + raidz_col_t *rc = &rr->rr_col[c]; + if (rc->rc_tried || rc->rc_size == 0) + continue; + + zio_nowait(zio_vdev_child_io(zio, NULL, + vd->vdev_child[rc->rc_devidx], + rc->rc_offset, rc->rc_abd, rc->rc_size, + zio->io_type, zio->io_priority, 0, + vdev_raidz_child_done, rc)); + nread++; + } + return (nread); +} + +static void +vdev_raidz_io_done(zio_t *zio) +{ + raidz_map_t *rm = zio->io_vsd; + vdev_raidz_t *vdrz = zio->io_vd->vdev_tsd; + + ASSERT(zio->io_bp != NULL); /* XXX need to add code to enforce this */ + if (zio->io_type == ZIO_TYPE_WRITE) { + for (int i = 0; i < rm->rm_nrows; i++) { + vdev_raidz_io_done_write_impl(zio, rm->rm_row[i]); + } + } else if (zio->io_type == ZIO_TYPE_FREE) { + return; + } else { + for (int i = 0; i < rm->rm_nrows; i++) { + raidz_row_t *rr = rm->rm_row[i]; + rr->rr_code = + vdev_raidz_io_done_reconstruct_known_missing(zio, + rr); + } + + if (raidz_checksum_verify(zio) == 0) { + for (int i = 0; i < rm->rm_nrows; i++) { + raidz_row_t *rr = rm->rm_row[i]; + atomic_inc_64(&raidz_corrected[rr->rr_code]); + vdev_raidz_io_done_verified(zio, rr); } + zio_checksum_verified(zio); } else { /* - * We either attempt to read all the parity columns or - * none of them. If we didn't try to read parity, we - * wouldn't be here in the correctable case. There must - * also have been fewer parity errors than parity - * columns or, again, we wouldn't be in this code path. + * This isn't a typical situation -- either we got a + * read error or a child silently returned bad data. + * Read every block so we can try again with as much + * data and parity as we can track down. If we've + * already been through once before, all children will + * be marked as tried so we'll proceed to combinatorial + * reconstruction. */ - ASSERT(parity_untried == 0); - ASSERT(parity_errors < rm->rm_firstdatacol); - + int nread = 0; + for (int i = 0; i < rm->rm_nrows; i++) { + nread += vdev_raidz_read_all(zio, + rm->rm_row[i]); + } + if (nread != 0) { + /* + * Normally our stage is VDEV_IO_DONE, but if + * we've already called redone(), it will have + * changed to VDEV_IO_START, in which case we + * don't want to call redone() again. + */ + if (zio->io_stage != ZIO_STAGE_VDEV_IO_START) + zio_vdev_io_redone(zio); + return; + } /* - * Identify the data columns that reported an error. + * It would be too expensive to try every possible + * combination of failed sectors in every row, so + * instead we try every combination of failed current or + * past physical disk. 
This means that if the incorrect
+		 * sectors were all on Nparity disks at any point in the
+		 * past, we will find the correct data.  I think that
+		 * the only case where this is less durable than
+		 * a non-expanded RAIDZ, is if we have a silent
+		 * failure during expansion.  In that case, one block
+		 * could be partially in the old format and partially
+		 * in the new format, so we'd lose some sectors
+		 * from the old format and some from the new format.
+		 *
+		 * e.g. logical_width=4 physical_width=6
+		 * the 15 (6+5+4) possible failed disks are:
+		 * width=6 child=0
+		 * width=6 child=1
+		 * width=6 child=2
+		 * width=6 child=3
+		 * width=6 child=4
+		 * width=6 child=5
+		 * width=5 child=0
+		 * width=5 child=1
+		 * width=5 child=2
+		 * width=5 child=3
+		 * width=5 child=4
+		 * width=4 child=0
+		 * width=4 child=1
+		 * width=4 child=2
+		 * width=4 child=3
+		 * And we will try every combination of Nparity of these
+		 * failing.
+		 *
+		 * As a first pass, we can generate every combo,
+		 * and try reconstructing, ignoring any known
+		 * failures.  If any row has too many known + simulated
+		 * failures, then we bail on reconstructing with this
+		 * number of simulated failures.  As an improvement,
+		 * we could detect the number of whole known failures
+		 * (i.e. we have known failures on these disks for
+		 * every row; the disks never succeeded), and
+		 * subtract that from the max # failures to simulate.
+		 * We could go even further like the current
+		 * combrec code, but that doesn't seem like it
+		 * gains us very much.  If we simulate a failure
+		 * that is also a known failure, that's fine.
		 */
-		n = 0;
-		for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) {
-			rc = &rm->rm_col[c];
-			if (rc->rc_error != 0) {
-				ASSERT(n < VDEV_RAIDZ_MAXPARITY);
-				tgts[n++] = c;
-			}
-		}
-
-		ASSERT(rm->rm_firstdatacol >= n);
-
-		code = vdev_raidz_reconstruct(rm, tgts, n);
-
-		if (raidz_checksum_verify(zio) == 0) {
-			atomic_inc_64(&raidz_corrected[code]);
-
+		if (vdev_raidz_combrec(zio) != 0) {
			/*
-			 * If we read more parity disks than were used
-			 * for reconstruction, confirm that the other
-			 * parity disks produced correct data.  This
-			 * routine is suboptimal in that it regenerates
-			 * the parity that we already used in addition
-			 * to the parity that we're attempting to
-			 * verify, but this should be a relatively
-			 * uncommon case, and can be optimized if it
-			 * becomes a problem.  Note that we regenerate
-			 * parity when resilvering so we can write it
-			 * out to failed devices later.
+			 * We're here because either:
+			 *
+			 * total_errors == rr_firstdatacol, or
+			 * vdev_raidz_combrec() failed
+			 *
+			 * In either case, there is enough bad data to prevent
+			 * reconstruction.
+			 *
+			 * Start checksum ereports for all children which haven't
+			 * failed, provided the IO wasn't speculative.
*/ - if (parity_errors < rm->rm_firstdatacol - n || - (zio->io_flags & ZIO_FLAG_RESILVER)) { - n = raidz_parity_verify(zio, rm); - unexpected_errors += n; - ASSERT(parity_errors + n <= - rm->rm_firstdatacol); + zio->io_error = SET_ERROR(ECKSUM); + + if (!(zio->io_flags & ZIO_FLAG_SPECULATIVE)) { + for (int i = 0; i < rm->rm_nrows; i++) { + raidz_row_t *rr = rm->rm_row[i]; + for (int c = 0; c < rr->rr_cols; c++) { + raidz_col_t *rc = &rr->rr_col[c]; + if (rc->rc_error == 0) { + zio_bad_cksum_t zbc; + zbc.zbc_has_cksum = 0; + zbc.zbc_injected = + rm->rm_ecksuminjected; + + zfs_ereport_start_checksum( + zio->io_spa, + zio->io_vd->vdev_child[rc->rc_devidx], + zio, rc->rc_offset, rc->rc_size, + (void *)(uintptr_t)c, &zbc); + } + } + } } - - goto done; } } } + ASSERT(!vdrz->vn_expanding); +} + +static void +vdev_raidz_state_change(vdev_t *vd, int faulted, int degraded) +{ + vdev_raidz_t *vdrz = vd->vdev_tsd; + if (faulted > vdrz->vd_nparity) + vdev_set_state(vd, B_FALSE, VDEV_STATE_CANT_OPEN, + VDEV_AUX_NO_REPLICAS); + else if (degraded + faulted != 0) + vdev_set_state(vd, B_FALSE, VDEV_STATE_DEGRADED, VDEV_AUX_NONE); + else + vdev_set_state(vd, B_FALSE, VDEV_STATE_HEALTHY, VDEV_AUX_NONE); +} + +static void +raidz_copy_range(void *arg, uint64_t start, uint64_t size) +{ + vdev_t *vd = arg; + int ashift = vd->vdev_top->vdev_ashift; + int old_children = vd->vdev_children - 1; + spa_t *spa = vd->vdev_spa; + + ASSERT(IS_P2ALIGNED(start, 1 << ashift)); + ASSERT(IS_P2ALIGNED(size, 1 << ashift)); + + abd_t *abd = abd_alloc_for_io(1 << ashift, B_FALSE); + for (uint64_t i = MAX(start >> ashift, old_children); + i < (start + size) >> ashift; i++) { + int child = i % old_children; + int offset = (i / old_children) << ashift; + spa_config_enter(spa, SCL_STATE, spa, RW_READER); + VERIFY0(zio_wait(zio_read_phys(NULL, + vd->vdev_child[child], + offset + VDEV_LABEL_START_SIZE, + 1 << ashift, abd, + ZIO_CHECKSUM_OFF, NULL, NULL, + ZIO_PRIORITY_REMOVAL, 0, B_FALSE))); + + child = i % vd->vdev_children; + offset = (i / vd->vdev_children) << ashift; + VERIFY0(zio_wait(zio_write_phys(NULL, + vd->vdev_child[child], + offset + VDEV_LABEL_START_SIZE, + 1 << ashift, abd, + ZIO_CHECKSUM_OFF, NULL, NULL, + ZIO_PRIORITY_REMOVAL, 0, B_FALSE))); + spa_config_exit(spa, SCL_STATE, spa); + } + abd_free(abd); +} + +void +vdev_raidz_attach_sync(void *arg, dmu_tx_t *tx) +{ + vdev_t *new_child = arg; + spa_t *spa = new_child->vdev_spa; + vdev_t *raidvd = new_child->vdev_parent; + vdev_raidz_t *vdrz = raidvd->vdev_tsd; + ASSERT3P(raidvd->vdev_ops, ==, &vdev_raidz_ops); + ASSERT3P(raidvd->vdev_top, ==, raidvd); + ASSERT3U(raidvd->vdev_children, >, vdrz->vd_logical_width); + ASSERT3U(raidvd->vdev_children, ==, vdrz->vd_physical_width + 1); /* - * This isn't a typical situation -- either we got a read error or - * a child silently returned bad data. Read every block so we can - * try again with as much data and parity as we can track down. If - * we've already been through once before, all children will be marked - * as tried so we'll proceed to combinatorial reconstruction. + * XXX assuming that no other i/o takes place while this is happening, + * until we increment physical_width. But ZIL could do i/o. 
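+	 * The vn_expanding flag records this assumption; the I/O
+	 * paths ASSERT that it is clear.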
*/ - unexpected_errors = 1; - rm->rm_missingdata = 0; - rm->rm_missingparity = 0; + vdrz->vn_expanding = B_TRUE; - for (c = 0; c < rm->rm_cols; c++) { - if (rm->rm_col[c].rc_tried) - continue; + /*spa_config_enter(spa, SCL_CONFIG, FTAG, RW_READER);*/ - zio_vdev_io_redone(zio); - do { - rc = &rm->rm_col[c]; - if (rc->rc_tried) - continue; - zio_nowait(zio_vdev_child_io(zio, NULL, - vd->vdev_child[rc->rc_devidx], - rc->rc_offset, rc->rc_abd, rc->rc_size, - zio->io_type, zio->io_priority, 0, - vdev_raidz_child_done, rc)); - } while (++c < rm->rm_cols); + range_tree_t *rt = range_tree_create(NULL, NULL); - return; + for (uint64_t i = 0; i < raidvd->vdev_ms_count; i++) { + metaslab_t *msp = raidvd->vdev_ms[i]; + + /*vdev_initialize_ms_mark(msp);*/ + mutex_enter(&msp->ms_lock); + + metaslab_load_wait(msp); + if (!msp->ms_loaded) + VERIFY0(metaslab_load(msp)); + + /* + * We want to copy everything except the free (allocatable) + * space. Note that there may be a little bit more free + * space (e.g. in ms_defer), and it's fine to copy that too. + */ + ASSERT(range_tree_is_empty(rt)); + range_tree_add(rt, msp->ms_start, msp->ms_size); + range_tree_walk(msp->ms_allocatable, range_tree_remove, rt); + mutex_exit(&msp->ms_lock); + + /*spa_config_exit(spa, SCL_CONFIG, FTAG);*/ + /* Note, _vacate() doesn't visit in order */ + range_tree_walk(rt, raidz_copy_range, raidvd); + range_tree_vacate(rt, NULL, NULL); + /*vdev_initialize_ms_unmark(msp);*/ + /*spa_config_enter(spa, SCL_CONFIG, FTAG, RW_READER);*/ } + /*spa_config_exit(spa, SCL_CONFIG, FTAG);*/ + range_tree_destroy(rt); + + vdrz->vd_physical_width++; + +#if 0 + raidvd->vdev_expanding = B_TRUE; + vdev_reopen(raidvd); + raidvd->vdev_expanding = B_FALSE; +#endif + + vdrz->vn_expanding = B_FALSE; + /* Ensure that widths get written to label config */ + vdev_config_dirty(raidvd); +} + +/* + * Add RAIDZ-specific fields to the config nvlist. + * XXX add this to vdev_ops_t? + */ +void +vdev_raidz_config_generate(vdev_t *vd, nvlist_t *nv) +{ + spa_t *spa = vd->vdev_spa; + ASSERT3P(vd->vdev_ops, ==, &vdev_raidz_ops); + vdev_raidz_t *vdrz = vd->vdev_tsd; + + /* + * Make sure someone hasn't managed to sneak a fancy new vdev + * into a crufty old storage pool. + */ + ASSERT(vdrz->vd_nparity == 1 || + (vdrz->vd_nparity <= 2 && + spa_version(spa) >= SPA_VERSION_RAIDZ2) || + (vdrz->vd_nparity <= 3 && + spa_version(spa) >= SPA_VERSION_RAIDZ3)); + /* - * At this point we've attempted to reconstruct the data given the - * errors we detected, and we've attempted to read all columns. There - * must, therefore, be one or more additional problems -- silent errors - * resulting in invalid data rather than explicit I/O errors resulting - * in absent data. We check if there is enough additional data to - * possibly reconstruct the data and then perform combinatorial - * reconstruction over all possible combinations. If that fails, - * we're cooked. + * Note that we'll add these even on storage pools where they + * aren't strictly required -- older software will just ignore + * it. */ - if (total_errors > rm->rm_firstdatacol) { - zio->io_error = vdev_raidz_worst_error(rm); + fnvlist_add_uint64(nv, ZPOOL_CONFIG_NPARITY, vdrz->vd_nparity); + fnvlist_add_uint64(nv, ZPOOL_CONFIG_RAIDZ_LOGICAL_WIDTH, + vdrz->vd_logical_width); +} + +/* + * Set RAIDZ-specific fields in the vdev_t, based on the config. + * Can't assume that anything about the vdev_t is already set. + * XXX add this to vdev_ops_t? 
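+ * (The returned struct is what the I/O paths later read
+ * back through vd->vdev_tsd.)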
+ */
+void *
+vdev_raidz_get_tsd(spa_t *spa, nvlist_t *nv)
+{
+	uint64_t nparity, lw;
+	vdev_raidz_t *vdrz = kmem_zalloc(sizeof (*vdrz), KM_SLEEP);
+
+	uint_t children;
+	nvlist_t **child;
+	int error = nvlist_lookup_nvlist_array(nv,
+	    ZPOOL_CONFIG_CHILDREN, &child, &children);
+	if (error != 0)
+		goto out;
+
+	vdrz->vd_logical_width = children;
+	vdrz->vd_physical_width = children;
+
+	if (nvlist_lookup_uint64(nv, ZPOOL_CONFIG_RAIDZ_LOGICAL_WIDTH,
+	    &lw) == 0) {
+		vdrz->vd_logical_width = lw;
+	}

-	} else if (total_errors < rm->rm_firstdatacol &&
-	    (code = vdev_raidz_combrec(zio, total_errors, data_errors)) != 0) {
+	if (nvlist_lookup_uint64(nv, ZPOOL_CONFIG_NPARITY,
+	    &nparity) == 0) {
+		if (nparity == 0 || nparity > VDEV_RAIDZ_MAXPARITY)
+			goto out;
		/*
-		 * If we didn't use all the available parity for the
-		 * combinatorial reconstruction, verify that the remaining
-		 * parity is correct.
+		 * Previous versions could only support 1 or 2 parity
+		 * devices.
		 */
-		if (code != (1 << rm->rm_firstdatacol) - 1)
-			(void) raidz_parity_verify(zio, rm);
+		if (nparity > 1 &&
+		    spa_version(spa) < SPA_VERSION_RAIDZ2)
+			goto out;
+		if (nparity > 2 &&
+		    spa_version(spa) < SPA_VERSION_RAIDZ3)
+			goto out;
	} else {
		/*
-		 * We're here because either:
-		 *
-		 * total_errors == rm_first_datacol, or
-		 * vdev_raidz_combrec() failed
-		 *
-		 * In either case, there is enough bad data to prevent
-		 * reconstruction.
-		 *
-		 * Start checksum ereports for all children which haven't
-		 * failed, and the IO wasn't speculative.
+		 * We require the parity to be specified for SPAs that
+		 * support multiple parity levels.
		 */
-		zio->io_error = SET_ERROR(ECKSUM);
-
-		if (!(zio->io_flags & ZIO_FLAG_SPECULATIVE)) {
-			for (c = 0; c < rm->rm_cols; c++) {
-				rc = &rm->rm_col[c];
-				if (rc->rc_error == 0) {
-					zio_bad_cksum_t zbc;
-					zbc.zbc_has_cksum = 0;
-					zbc.zbc_injected =
-					    rm->rm_ecksuminjected;
-
-					zfs_ereport_start_checksum(
-					    zio->io_spa,
-					    vd->vdev_child[rc->rc_devidx],
-					    zio, rc->rc_offset, rc->rc_size,
-					    (void *)(uintptr_t)c, &zbc);
-				}
-			}
-		}
-	}
-	}

-done:
-	zio_checksum_verified(zio);
-
-	if (zio->io_error == 0 && spa_writeable(zio->io_spa) &&
-	    (unexpected_errors || (zio->io_flags & ZIO_FLAG_RESILVER))) {
+		if (spa_version(spa) >= SPA_VERSION_RAIDZ2)
+			goto out;
		/*
-		 * Use the good data we have in hand to repair damaged children.
+		 * Otherwise, we default to 1 parity device for RAID-Z.
		 */
-		for (c = 0; c < rm->rm_cols; c++) {
-			rc = &rm->rm_col[c];
-			cvd = vd->vdev_child[rc->rc_devidx];
-
-			if (rc->rc_error == 0)
-				continue;
-
-			zio_nowait(zio_vdev_child_io(zio, NULL, cvd,
-			    rc->rc_offset, rc->rc_abd, rc->rc_size,
-			    ZIO_TYPE_WRITE, ZIO_PRIORITY_ASYNC_WRITE,
-			    ZIO_FLAG_IO_REPAIR | (unexpected_errors ?
- ZIO_FLAG_SELF_HEAL : 0), NULL, NULL)); - } + nparity = 1; } -} - -static void -vdev_raidz_state_change(vdev_t *vd, int faulted, int degraded) -{ - if (faulted > vd->vdev_nparity) - vdev_set_state(vd, B_FALSE, VDEV_STATE_CANT_OPEN, - VDEV_AUX_NO_REPLICAS); - else if (degraded + faulted != 0) - vdev_set_state(vd, B_FALSE, VDEV_STATE_DEGRADED, VDEV_AUX_NONE); - else - vdev_set_state(vd, B_FALSE, VDEV_STATE_HEALTHY, VDEV_AUX_NONE); + vdrz->vd_nparity = nparity; + return (vdrz); +out: + kmem_free(vdrz, sizeof (*vdrz)); + return (NULL); } vdev_ops_t vdev_raidz_ops = { Index: sys/cddl/contrib/opensolaris/uts/common/sys/fs/zfs.h =================================================================== --- sys/cddl/contrib/opensolaris/uts/common/sys/fs/zfs.h +++ sys/cddl/contrib/opensolaris/uts/common/sys/fs/zfs.h @@ -556,6 +556,7 @@ #define ZPOOL_CONFIG_SPARES "spares" #define ZPOOL_CONFIG_IS_SPARE "is_spare" #define ZPOOL_CONFIG_NPARITY "nparity" +#define ZPOOL_CONFIG_RAIDZ_LOGICAL_WIDTH "raidz_logical_width" #define ZPOOL_CONFIG_HOSTID "hostid" #define ZPOOL_CONFIG_HOSTNAME "hostname" #define ZPOOL_CONFIG_LOADED_TIME "initial_load_time"
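
Reviewer note (not part of the patch): the slot-increment rules in the block
comment above vdev_raidz_combrec() can be sanity-checked with a small
standalone C sketch. Everything below is invented for illustration --
NPARITY, NCOLS, and next_combo are not names from the patch. It assumes a
6-wide RAIDZ3 with no known errors, and prints exactly the twenty states of
the STATE/ACTION table, from the all-parity combination "0 1 2" through
"3 4 5".

#include <stdio.h>

#define	NPARITY	3	/* assumed: triple parity */
#define	NCOLS	6	/* assumed: 6 children */

/*
 * Advance tgts[] to the next combination using the same walk as
 * vdev_raidz_combrec(): increment a slot; if it collides with the
 * next slot, reset it to its minimum and carry into the next slot.
 * Returns 0 once the last slot runs off the end (all combinations
 * of NPARITY targets have been visited).
 */
static int
next_combo(int *tgts, int nslots, int n)
{
	for (int i = 0; i < nslots; i++) {
		tgts[i]++;
		if (tgts[i] == n)
			return (0);	/* only the last slot can reach n */
		if (tgts[i] != tgts[i + 1])
			return (1);	/* found the next combination */
		/* reset to this slot's minimum and carry upward */
		tgts[i] = (i == 0) ? 0 : tgts[i - 1] + 1;
	}
	return (0);
}

int
main(void)
{
	int tgts[NPARITY + 1];

	for (int i = 0; i < NPARITY; i++)
		tgts[i] = i;		/* start at 0 1 2 (all parity) */
	tgts[NPARITY] = NCOLS;		/* sentinel, like ltgts[num_failures] = n */

	do {
		printf("%d %d %d\n", tgts[0], tgts[1], tgts[2]);
	} while (next_combo(tgts, NPARITY, NCOLS));
	return (0);
}

Unlike the patch, the sketch does not need the tstore[-1] = -1 sentinel,
because the slot-0 reset is special-cased inside next_combo(); the patch's
ltgts[t] = ltgts[t - 1] + 1 relies on that sentinel to make slot 0 reset
to zero.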