Changeset View
Changeset View
Standalone View
Standalone View
head/sys/cddl/boot/zfs/zfssubr.c
Show First 20 Lines • Show All 57 Lines • ▼ Show 20 Lines | if (zfs_crc64_table[128] != ZFS_CRC64_POLY) { | ||||
memset(zfs_crc64_table, 0, sizeof(zfs_crc64_table)); | memset(zfs_crc64_table, 0, sizeof(zfs_crc64_table)); | ||||
for (i = 0; i < 256; i++) | for (i = 0; i < 256; i++) | ||||
for (ct = zfs_crc64_table + i, *ct = i, j = 8; j > 0; j--) | for (ct = zfs_crc64_table + i, *ct = i, j = 8; j > 0; j--) | ||||
*ct = (*ct >> 1) ^ (-(*ct & 1) & ZFS_CRC64_POLY); | *ct = (*ct >> 1) ^ (-(*ct & 1) & ZFS_CRC64_POLY); | ||||
} | } | ||||
} | } | ||||
static void | static void | ||||
zio_checksum_off(const void *buf, uint64_t size, zio_cksum_t *zcp) | zio_checksum_off(const void *buf, uint64_t size, | ||||
const void *ctx_template, zio_cksum_t *zcp) | |||||
{ | { | ||||
ZIO_SET_CHECKSUM(zcp, 0, 0, 0, 0); | ZIO_SET_CHECKSUM(zcp, 0, 0, 0, 0); | ||||
} | } | ||||
/* | /* | ||||
* Signature for checksum functions. | * Signature for checksum functions. | ||||
*/ | */ | ||||
typedef void zio_checksum_t(const void *data, uint64_t size, zio_cksum_t *zcp); | typedef void zio_checksum_t(const void *data, uint64_t size, | ||||
const void *ctx_template, zio_cksum_t *zcp); | |||||
typedef void *zio_checksum_tmpl_init_t(const zio_cksum_salt_t *salt); | |||||
typedef void zio_checksum_tmpl_free_t(void *ctx_template); | |||||
/*
 * Property bits for a checksum algorithm.  Each value is a distinct
 * single-bit mask so that several can be OR-ed together; bit 0 is not
 * assigned.
 */
typedef enum zio_checksum_flags {
	ZCHECKSUM_FLAG_METADATA	= (1 << 1),	/* strong enough for metadata */
	ZCHECKSUM_FLAG_EMBEDDED	= (1 << 2),	/* ZIO embedded checksum */
	ZCHECKSUM_FLAG_DEDUP	= (1 << 3),	/* dedup w/o verification */
	ZCHECKSUM_FLAG_SALTED	= (1 << 4),	/* uses salt value */
	ZCHECKSUM_FLAG_NOPWRITE	= (1 << 5)	/* strong enough for nopwrite */
} zio_checksum_flags_t;
/* | /* | ||||
* Information about each checksum function. | * Information about each checksum function. | ||||
*/ | */ | ||||
typedef struct zio_checksum_info { | typedef struct zio_checksum_info { | ||||
zio_checksum_t *ci_func[2]; /* checksum function for each byteorder */ | /* checksum function for each byteorder */ | ||||
int ci_correctable; /* number of correctable bits */ | zio_checksum_t *ci_func[2]; | ||||
int ci_eck; /* uses zio embedded checksum? */ | zio_checksum_tmpl_init_t *ci_tmpl_init; | ||||
int ci_dedup; /* strong enough for dedup? */ | zio_checksum_tmpl_free_t *ci_tmpl_free; | ||||
zio_checksum_flags_t ci_flags; | |||||
const char *ci_name; /* descriptive name */ | const char *ci_name; /* descriptive name */ | ||||
} zio_checksum_info_t; | } zio_checksum_info_t; | ||||
#include "blkptr.c" | #include "blkptr.c" | ||||
#include "fletcher.c" | #include "fletcher.c" | ||||
#include "sha256.c" | #include "sha256.c" | ||||
#include "skein_zfs.c" | |||||
static zio_checksum_info_t zio_checksum_table[ZIO_CHECKSUM_FUNCTIONS] = { | static zio_checksum_info_t zio_checksum_table[ZIO_CHECKSUM_FUNCTIONS] = { | ||||
{{NULL, NULL}, 0, 0, 0, "inherit"}, | {{NULL, NULL}, NULL, NULL, 0, "inherit"}, | ||||
{{NULL, NULL}, 0, 0, 0, "on"}, | {{NULL, NULL}, NULL, NULL, 0, "on"}, | ||||
{{zio_checksum_off, zio_checksum_off}, 0, 0, 0, "off"}, | {{zio_checksum_off, zio_checksum_off}, NULL, NULL, 0, "off"}, | ||||
{{zio_checksum_SHA256, zio_checksum_SHA256}, 1, 1, 0, "label"}, | {{zio_checksum_SHA256, zio_checksum_SHA256}, NULL, NULL, | ||||
{{zio_checksum_SHA256, zio_checksum_SHA256}, 1, 1, 0, "gang_header"}, | ZCHECKSUM_FLAG_METADATA | ZCHECKSUM_FLAG_EMBEDDED, "label"}, | ||||
{{fletcher_2_native, fletcher_2_byteswap}, 0, 1, 0, "zilog"}, | {{zio_checksum_SHA256, zio_checksum_SHA256}, NULL, NULL, | ||||
{{fletcher_2_native, fletcher_2_byteswap}, 0, 0, 0, "fletcher2"}, | ZCHECKSUM_FLAG_METADATA | ZCHECKSUM_FLAG_EMBEDDED, "gang_header"}, | ||||
{{fletcher_4_native, fletcher_4_byteswap}, 1, 0, 0, "fletcher4"}, | {{fletcher_2_native, fletcher_2_byteswap}, NULL, NULL, | ||||
{{zio_checksum_SHA256, zio_checksum_SHA256}, 1, 0, 1, "SHA256"}, | ZCHECKSUM_FLAG_EMBEDDED, "zilog"}, | ||||
{{fletcher_4_native, fletcher_4_byteswap}, 0, 1, 0, "zillog2"}, | {{fletcher_2_native, fletcher_2_byteswap}, NULL, NULL, | ||||
0, "fletcher2"}, | |||||
{{fletcher_4_native, fletcher_4_byteswap}, NULL, NULL, | |||||
ZCHECKSUM_FLAG_METADATA, "fletcher4"}, | |||||
{{zio_checksum_SHA256, zio_checksum_SHA256}, NULL, NULL, | |||||
ZCHECKSUM_FLAG_METADATA | ZCHECKSUM_FLAG_DEDUP | | |||||
ZCHECKSUM_FLAG_NOPWRITE, "SHA256"}, | |||||
{{fletcher_4_native, fletcher_4_byteswap}, NULL, NULL, | |||||
ZCHECKSUM_FLAG_EMBEDDED, "zillog2"}, | |||||
{{zio_checksum_off, zio_checksum_off}, NULL, NULL, | |||||
0, "noparity"}, | |||||
{{zio_checksum_SHA512_native, zio_checksum_SHA512_byteswap}, | |||||
NULL, NULL, ZCHECKSUM_FLAG_METADATA | ZCHECKSUM_FLAG_DEDUP | | |||||
ZCHECKSUM_FLAG_NOPWRITE, "SHA512"}, | |||||
{{zio_checksum_skein_native, zio_checksum_skein_byteswap}, | |||||
zio_checksum_skein_tmpl_init, zio_checksum_skein_tmpl_free, | |||||
ZCHECKSUM_FLAG_METADATA | ZCHECKSUM_FLAG_DEDUP | | |||||
ZCHECKSUM_FLAG_SALTED | ZCHECKSUM_FLAG_NOPWRITE, "skein"}, | |||||
/* no edonr for now */ | |||||
{{NULL, NULL}, NULL, NULL, ZCHECKSUM_FLAG_METADATA | | |||||
ZCHECKSUM_FLAG_SALTED | ZCHECKSUM_FLAG_NOPWRITE, "edonr"} | |||||
}; | }; | ||||
/*
 * Signatures shared by every zio compression and decompression routine:
 * source buffer, destination buffer, their lengths, and a trailing int
 * argument that these prototypes leave unnamed.
 */
typedef size_t zio_compress_func_t(void *src, void *dst,
    size_t src_len, size_t dst_len, int);
typedef int zio_decompress_func_t(void *src, void *dst,
    size_t src_len, size_t dst_len, int);
▲ Show 20 Lines • Show All 67 Lines • ▼ Show 20 Lines | |||||
* hence the logic in vdev_uberblock_load() to find the most recent copy. | * hence the logic in vdev_uberblock_load() to find the most recent copy. | ||||
*/ | */ | ||||
static void | static void | ||||
zio_checksum_label_verifier(zio_cksum_t *zcp, uint64_t offset) | zio_checksum_label_verifier(zio_cksum_t *zcp, uint64_t offset) | ||||
{ | { | ||||
ZIO_SET_CHECKSUM(zcp, offset, 0, 0, 0); | ZIO_SET_CHECKSUM(zcp, offset, 0, 0, 0); | ||||
} | } | ||||
/* | |||||
* Calls the template init function of a checksum which supports context | |||||
* templates and installs the template into the spa_t. | |||||
*/ | |||||
static void | |||||
zio_checksum_template_init(enum zio_checksum checksum, spa_t *spa) | |||||
{ | |||||
zio_checksum_info_t *ci = &zio_checksum_table[checksum]; | |||||
if (ci->ci_tmpl_init == NULL) | |||||
return; | |||||
if (spa->spa_cksum_tmpls[checksum] != NULL) | |||||
return; | |||||
if (spa->spa_cksum_tmpls[checksum] == NULL) { | |||||
spa->spa_cksum_tmpls[checksum] = | |||||
ci->ci_tmpl_init(&spa->spa_cksum_salt); | |||||
} | |||||
} | |||||
/* | |||||
* Called by a spa_t that's about to be deallocated. This steps through | |||||
* all of the checksum context templates and deallocates any that were | |||||
* initialized using the algorithm-specific template init function. | |||||
*/ | |||||
void | |||||
zio_checksum_templates_free(spa_t *spa) | |||||
{ | |||||
for (enum zio_checksum checksum = 0; | |||||
checksum < ZIO_CHECKSUM_FUNCTIONS; checksum++) { | |||||
if (spa->spa_cksum_tmpls[checksum] != NULL) { | |||||
zio_checksum_info_t *ci = &zio_checksum_table[checksum]; | |||||
ci->ci_tmpl_free(spa->spa_cksum_tmpls[checksum]); | |||||
spa->spa_cksum_tmpls[checksum] = NULL; | |||||
} | |||||
} | |||||
} | |||||
static int | static int | ||||
zio_checksum_verify(const blkptr_t *bp, void *data) | zio_checksum_verify(const spa_t *spa, const blkptr_t *bp, void *data) | ||||
{ | { | ||||
uint64_t size; | uint64_t size; | ||||
unsigned int checksum; | unsigned int checksum; | ||||
zio_checksum_info_t *ci; | zio_checksum_info_t *ci; | ||||
zio_cksum_t actual_cksum, expected_cksum, verifier; | zio_cksum_t actual_cksum, expected_cksum, verifier; | ||||
int byteswap; | int byteswap; | ||||
checksum = BP_GET_CHECKSUM(bp); | checksum = BP_GET_CHECKSUM(bp); | ||||
size = BP_GET_PSIZE(bp); | size = BP_GET_PSIZE(bp); | ||||
if (checksum >= ZIO_CHECKSUM_FUNCTIONS) | if (checksum >= ZIO_CHECKSUM_FUNCTIONS) | ||||
return (EINVAL); | return (EINVAL); | ||||
ci = &zio_checksum_table[checksum]; | ci = &zio_checksum_table[checksum]; | ||||
if (ci->ci_func[0] == NULL || ci->ci_func[1] == NULL) | if (ci->ci_func[0] == NULL || ci->ci_func[1] == NULL) | ||||
return (EINVAL); | return (EINVAL); | ||||
if (ci->ci_eck) { | zio_checksum_template_init(checksum, (spa_t *) spa); | ||||
if (ci->ci_flags & ZCHECKSUM_FLAG_EMBEDDED) { | |||||
zio_eck_t *eck; | zio_eck_t *eck; | ||||
ASSERT(checksum == ZIO_CHECKSUM_GANG_HEADER || | ASSERT(checksum == ZIO_CHECKSUM_GANG_HEADER || | ||||
checksum == ZIO_CHECKSUM_LABEL); | checksum == ZIO_CHECKSUM_LABEL); | ||||
eck = (zio_eck_t *)((char *)data + size) - 1; | eck = (zio_eck_t *)((char *)data + size) - 1; | ||||
if (checksum == ZIO_CHECKSUM_GANG_HEADER) | if (checksum == ZIO_CHECKSUM_GANG_HEADER) | ||||
zio_checksum_gang_verifier(&verifier, bp); | zio_checksum_gang_verifier(&verifier, bp); | ||||
else if (checksum == ZIO_CHECKSUM_LABEL) | else if (checksum == ZIO_CHECKSUM_LABEL) | ||||
zio_checksum_label_verifier(&verifier, | zio_checksum_label_verifier(&verifier, | ||||
DVA_GET_OFFSET(BP_IDENTITY(bp))); | DVA_GET_OFFSET(BP_IDENTITY(bp))); | ||||
else | else | ||||
verifier = bp->blk_cksum; | verifier = bp->blk_cksum; | ||||
byteswap = (eck->zec_magic == BSWAP_64(ZEC_MAGIC)); | byteswap = (eck->zec_magic == BSWAP_64(ZEC_MAGIC)); | ||||
if (byteswap) | if (byteswap) | ||||
byteswap_uint64_array(&verifier, sizeof (zio_cksum_t)); | byteswap_uint64_array(&verifier, sizeof (zio_cksum_t)); | ||||
expected_cksum = eck->zec_cksum; | expected_cksum = eck->zec_cksum; | ||||
eck->zec_cksum = verifier; | eck->zec_cksum = verifier; | ||||
ci->ci_func[byteswap](data, size, &actual_cksum); | ci->ci_func[byteswap](data, size, | ||||
spa->spa_cksum_tmpls[checksum], &actual_cksum); | |||||
eck->zec_cksum = expected_cksum; | eck->zec_cksum = expected_cksum; | ||||
if (byteswap) | if (byteswap) | ||||
byteswap_uint64_array(&expected_cksum, | byteswap_uint64_array(&expected_cksum, | ||||
sizeof (zio_cksum_t)); | sizeof (zio_cksum_t)); | ||||
} else { | } else { | ||||
expected_cksum = bp->blk_cksum; | expected_cksum = bp->blk_cksum; | ||||
ci->ci_func[0](data, size, &actual_cksum); | ci->ci_func[0](data, size, spa->spa_cksum_tmpls[checksum], | ||||
&actual_cksum); | |||||
} | } | ||||
if (!ZIO_CHECKSUM_EQUAL(actual_cksum, expected_cksum)) { | if (!ZIO_CHECKSUM_EQUAL(actual_cksum, expected_cksum)) { | ||||
/*printf("ZFS: read checksum failed\n");*/ | /*printf("ZFS: read checksum %s failed\n", ci->ci_name);*/ | ||||
return (EIO); | return (EIO); | ||||
} | } | ||||
return (0); | return (0); | ||||
} | } | ||||
static int | static int | ||||
zio_decompress_data(int cpfunc, void *src, uint64_t srcsize, | zio_decompress_data(int cpfunc, void *src, uint64_t srcsize, | ||||
▲ Show 20 Lines • Show All 993 Lines • ▼ Show 20 Lines | vdev_child(vdev_t *pvd, uint64_t devidx) | ||||
return (cvd); | return (cvd); | ||||
} | } | ||||
/* | /* | ||||
* We keep track of whether or not there were any injected errors, so that | * We keep track of whether or not there were any injected errors, so that | ||||
* any ereports we generate can note it. | * any ereports we generate can note it. | ||||
*/ | */ | ||||
static int | static int | ||||
raidz_checksum_verify(const blkptr_t *bp, void *data, uint64_t size) | raidz_checksum_verify(const spa_t *spa, const blkptr_t *bp, void *data, | ||||
uint64_t size) | |||||
{ | { | ||||
return (zio_checksum_verify(spa, bp, data)); | |||||
return (zio_checksum_verify(bp, data)); | |||||
} | } | ||||
/* | /* | ||||
* Generate the parity from the data columns. If we tried and were able to | * Generate the parity from the data columns. If we tried and were able to | ||||
* read the parity without error, verify that the generated parity matches the | * read the parity without error, verify that the generated parity matches the | ||||
* data we read. If it doesn't, we fire off a checksum error. Return the | * data we read. If it doesn't, we fire off a checksum error. Return the | ||||
* number of such failures. | * number of such failures. | ||||
*/ | */ | ||||
Show All 32 Lines | |||||
* Iterate over all combinations of bad data and attempt a reconstruction. | * Iterate over all combinations of bad data and attempt a reconstruction. | ||||
* Note that the algorithm below is non-optimal because it doesn't take into | * Note that the algorithm below is non-optimal because it doesn't take into | ||||
* account how reconstruction is actually performed. For example, with | * account how reconstruction is actually performed. For example, with | ||||
* triple-parity RAID-Z the reconstruction procedure is the same if column 4 | * triple-parity RAID-Z the reconstruction procedure is the same if column 4 | ||||
* is targeted as invalid as if columns 1 and 4 are targeted since in both | * is targeted as invalid as if columns 1 and 4 are targeted since in both | ||||
* cases we'd only use parity information in column 0. | * cases we'd only use parity information in column 0. | ||||
*/ | */ | ||||
static int | static int | ||||
vdev_raidz_combrec(raidz_map_t *rm, const blkptr_t *bp, void *data, | vdev_raidz_combrec(const spa_t *spa, raidz_map_t *rm, const blkptr_t *bp, | ||||
off_t offset, uint64_t bytes, int total_errors, int data_errors) | void *data, off_t offset, uint64_t bytes, int total_errors, int data_errors) | ||||
{ | { | ||||
raidz_col_t *rc; | raidz_col_t *rc; | ||||
void *orig[VDEV_RAIDZ_MAXPARITY]; | void *orig[VDEV_RAIDZ_MAXPARITY]; | ||||
int tstore[VDEV_RAIDZ_MAXPARITY + 2]; | int tstore[VDEV_RAIDZ_MAXPARITY + 2]; | ||||
int *tgts = &tstore[1]; | int *tgts = &tstore[1]; | ||||
int current, next, i, c, n; | int current, next, i, c, n; | ||||
int code, ret = 0; | int code, ret = 0; | ||||
▲ Show 20 Lines • Show All 62 Lines • ▼ Show 20 Lines | while (current != n) { | ||||
bcopy(rc->rc_data, orig[i], rc->rc_size); | bcopy(rc->rc_data, orig[i], rc->rc_size); | ||||
} | } | ||||
/* | /* | ||||
* Attempt a reconstruction and exit the outer loop on | * Attempt a reconstruction and exit the outer loop on | ||||
* success. | * success. | ||||
*/ | */ | ||||
code = vdev_raidz_reconstruct(rm, tgts, n); | code = vdev_raidz_reconstruct(rm, tgts, n); | ||||
if (raidz_checksum_verify(bp, data, bytes) == 0) { | if (raidz_checksum_verify(spa, bp, data, bytes) == 0) { | ||||
for (i = 0; i < n; i++) { | for (i = 0; i < n; i++) { | ||||
c = tgts[i]; | c = tgts[i]; | ||||
rc = &rm->rm_col[c]; | rc = &rm->rm_col[c]; | ||||
ASSERT(rc->rc_error == 0); | ASSERT(rc->rc_error == 0); | ||||
rc->rc_error = ECKSUM; | rc->rc_error = ECKSUM; | ||||
} | } | ||||
ret = code; | ret = code; | ||||
▲ Show 20 Lines • Show All 154 Lines • ▼ Show 20 Lines | reconstruct: | ||||
/* | /* | ||||
* If the number of errors we saw was correctable -- less than or equal | * If the number of errors we saw was correctable -- less than or equal | ||||
* to the number of parity disks read -- attempt to produce data that | * to the number of parity disks read -- attempt to produce data that | ||||
* has a valid checksum. Naturally, this case applies in the absence of | * has a valid checksum. Naturally, this case applies in the absence of | ||||
* any errors. | * any errors. | ||||
*/ | */ | ||||
if (total_errors <= rm->rm_firstdatacol - parity_untried) { | if (total_errors <= rm->rm_firstdatacol - parity_untried) { | ||||
if (data_errors == 0) { | if (data_errors == 0) { | ||||
if (raidz_checksum_verify(bp, data, bytes) == 0) { | if (raidz_checksum_verify(vd->spa, bp, data, bytes) == 0) { | ||||
/* | /* | ||||
* If we read parity information (unnecessarily | * If we read parity information (unnecessarily | ||||
* as it happens since no reconstruction was | * as it happens since no reconstruction was | ||||
* needed) regenerate and verify the parity. | * needed) regenerate and verify the parity. | ||||
* We also regenerate parity when resilvering | * We also regenerate parity when resilvering | ||||
* so we can write it out to the failed device | * so we can write it out to the failed device | ||||
* later. | * later. | ||||
*/ | */ | ||||
Show All 28 Lines | if (data_errors == 0) { | ||||
tgts[n++] = c; | tgts[n++] = c; | ||||
} | } | ||||
} | } | ||||
ASSERT(rm->rm_firstdatacol >= n); | ASSERT(rm->rm_firstdatacol >= n); | ||||
code = vdev_raidz_reconstruct(rm, tgts, n); | code = vdev_raidz_reconstruct(rm, tgts, n); | ||||
if (raidz_checksum_verify(bp, data, bytes) == 0) { | if (raidz_checksum_verify(vd->spa, bp, data, bytes) == 0) { | ||||
/* | /* | ||||
* If we read more parity disks than were used | * If we read more parity disks than were used | ||||
* for reconstruction, confirm that the other | * for reconstruction, confirm that the other | ||||
* parity disks produced correct data. This | * parity disks produced correct data. This | ||||
* routine is suboptimal in that it regenerates | * routine is suboptimal in that it regenerates | ||||
* the parity that we already used in addition | * the parity that we already used in addition | ||||
* to the parity that we're attempting to | * to the parity that we're attempting to | ||||
* verify, but this should be a relatively | * verify, but this should be a relatively | ||||
▲ Show 20 Lines • Show All 57 Lines • ▼ Show 20 Lines | reconstruct: | ||||
* in absent data. We check if there is enough additional data to | * in absent data. We check if there is enough additional data to | ||||
* possibly reconstruct the data and then perform combinatorial | * possibly reconstruct the data and then perform combinatorial | ||||
* reconstruction over all possible combinations. If that fails, | * reconstruction over all possible combinations. If that fails, | ||||
* we're cooked. | * we're cooked. | ||||
*/ | */ | ||||
if (total_errors > rm->rm_firstdatacol) { | if (total_errors > rm->rm_firstdatacol) { | ||||
error = EIO; | error = EIO; | ||||
} else if (total_errors < rm->rm_firstdatacol && | } else if (total_errors < rm->rm_firstdatacol && | ||||
(code = vdev_raidz_combrec(rm, bp, data, offset, bytes, | (code = vdev_raidz_combrec(vd->spa, rm, bp, data, offset, bytes, | ||||
total_errors, data_errors)) != 0) { | total_errors, data_errors)) != 0) { | ||||
/* | /* | ||||
* If we didn't use all the available parity for the | * If we didn't use all the available parity for the | ||||
* combinatorial reconstruction, verify that the remaining | * combinatorial reconstruction, verify that the remaining | ||||
* parity is correct. | * parity is correct. | ||||
*/ | */ | ||||
if (code != (1 << rm->rm_firstdatacol) - 1) | if (code != (1 << rm->rm_firstdatacol) - 1) | ||||
(void) raidz_parity_verify(rm); | (void) raidz_parity_verify(rm); | ||||
Show All 21 Lines |