diff --git a/cmd/zstream/zstream_decompress.c b/cmd/zstream/zstream_decompress.c index 726c3be6dc6e..0cef36c0441f 100644 --- a/cmd/zstream/zstream_decompress.c +++ b/cmd/zstream/zstream_decompress.c @@ -1,400 +1,403 @@ /* * CDDL HEADER START * * The contents of this file are subject to the terms of the * Common Development and Distribution License (the "License"). * You may not use this file except in compliance with the License. * * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE * or https://opensource.org/licenses/CDDL-1.0. * See the License for the specific language governing permissions * and limitations under the License. * * When distributing Covered Code, include this CDDL HEADER in each * file and include the License file at usr/src/OPENSOLARIS.LICENSE. * If applicable, add the following below this CDDL HEADER, with the * fields enclosed by brackets "[]" replaced with your own identifying * information: Portions Copyright [yyyy] [name of copyright owner] * * CDDL HEADER END */ /* * Copyright 2022 Axcient. All rights reserved. * Use is subject to license terms. */ #include #include #include #include #include #include #include #include #include "zfs_fletcher.h" #include "zstream.h" static int dump_record(dmu_replay_record_t *drr, void *payload, int payload_len, zio_cksum_t *zc, int outfd) { assert(offsetof(dmu_replay_record_t, drr_u.drr_checksum.drr_checksum) == sizeof (dmu_replay_record_t) - sizeof (zio_cksum_t)); fletcher_4_incremental_native(drr, offsetof(dmu_replay_record_t, drr_u.drr_checksum.drr_checksum), zc); if (drr->drr_type != DRR_BEGIN) { assert(ZIO_CHECKSUM_IS_ZERO(&drr->drr_u. 
drr_checksum.drr_checksum)); drr->drr_u.drr_checksum.drr_checksum = *zc; } fletcher_4_incremental_native(&drr->drr_u.drr_checksum.drr_checksum, sizeof (zio_cksum_t), zc); if (write(outfd, drr, sizeof (*drr)) == -1) return (errno); if (payload_len != 0) { fletcher_4_incremental_native(payload, payload_len, zc); if (write(outfd, payload, payload_len) == -1) return (errno); } return (0); } int zstream_do_decompress(int argc, char *argv[]) { const int KEYSIZE = 64; int bufsz = SPA_MAXBLOCKSIZE; char *buf = safe_malloc(bufsz); dmu_replay_record_t thedrr; dmu_replay_record_t *drr = &thedrr; zio_cksum_t stream_cksum; int c; boolean_t verbose = B_FALSE; while ((c = getopt(argc, argv, "v")) != -1) { switch (c) { case 'v': verbose = B_TRUE; break; case '?': (void) fprintf(stderr, "invalid option '%c'\n", optopt); zstream_usage(); break; } } argc -= optind; argv += optind; if (argc < 0) zstream_usage(); if (hcreate(argc) == 0) errx(1, "hcreate"); for (int i = 0; i < argc; i++) { uint64_t object, offset; char *obj_str; char *offset_str; char *key; char *end; enum zio_compress type = ZIO_COMPRESS_LZ4; obj_str = strsep(&argv[i], ","); if (argv[i] == NULL) { zstream_usage(); exit(2); } errno = 0; object = strtoull(obj_str, &end, 0); if (errno || *end != '\0') errx(1, "invalid value for object"); offset_str = strsep(&argv[i], ","); offset = strtoull(offset_str, &end, 0); if (errno || *end != '\0') errx(1, "invalid value for offset"); if (argv[i]) { if (0 == strcmp("off", argv[i])) type = ZIO_COMPRESS_OFF; else if (0 == strcmp("lz4", argv[i])) type = ZIO_COMPRESS_LZ4; else if (0 == strcmp("lzjb", argv[i])) type = ZIO_COMPRESS_LZJB; else if (0 == strcmp("gzip", argv[i])) type = ZIO_COMPRESS_GZIP_1; else if (0 == strcmp("zle", argv[i])) type = ZIO_COMPRESS_ZLE; else if (0 == strcmp("zstd", argv[i])) type = ZIO_COMPRESS_ZSTD; else { fprintf(stderr, "Invalid compression type %s.\n" "Supported types are off, lz4, lzjb, gzip, " "zle, and zstd\n", argv[i]); exit(2); } } if (asprintf(&key, 
"%llu,%llu", (u_longlong_t)object, (u_longlong_t)offset) < 0) { err(1, "asprintf"); } ENTRY e = {.key = key}; ENTRY *p; p = hsearch(e, ENTER); if (p == NULL) errx(1, "hsearch"); p->data = (void*)(intptr_t)type; } if (isatty(STDIN_FILENO)) { (void) fprintf(stderr, "Error: The send stream is a binary format " "and can not be read from a\n" "terminal. Standard input must be redirected.\n"); exit(1); } fletcher_4_init(); int begin = 0; boolean_t seen = B_FALSE; while (sfread(drr, sizeof (*drr), stdin) != 0) { struct drr_write *drrw; uint64_t payload_size = 0; /* * We need to regenerate the checksum. */ if (drr->drr_type != DRR_BEGIN) { memset(&drr->drr_u.drr_checksum.drr_checksum, 0, sizeof (drr->drr_u.drr_checksum.drr_checksum)); } switch (drr->drr_type) { case DRR_BEGIN: { ZIO_SET_CHECKSUM(&stream_cksum, 0, 0, 0, 0); VERIFY0(begin++); seen = B_TRUE; - int sz = drr->drr_payloadlen; + uint32_t sz = drr->drr_payloadlen; + + VERIFY3U(sz, <=, 1U << 28); + if (sz != 0) { if (sz > bufsz) { buf = realloc(buf, sz); if (buf == NULL) err(1, "realloc"); bufsz = sz; } (void) sfread(buf, sz, stdin); } payload_size = sz; break; } case DRR_END: { struct drr_end *drre = &drr->drr_u.drr_end; /* * We would prefer to just check --begin == 0, but * replication streams have an end of stream END * record, so we must avoid tripping it. */ VERIFY3B(seen, ==, B_TRUE); begin--; /* * Use the recalculated checksum, unless this is * the END record of a stream package, which has * no checksum. 
*/ if (!ZIO_CHECKSUM_IS_ZERO(&drre->drr_checksum)) drre->drr_checksum = stream_cksum; break; } case DRR_OBJECT: { struct drr_object *drro = &drr->drr_u.drr_object; VERIFY3S(begin, ==, 1); if (drro->drr_bonuslen > 0) { payload_size = DRR_OBJECT_PAYLOAD_SIZE(drro); (void) sfread(buf, payload_size, stdin); } break; } case DRR_SPILL: { struct drr_spill *drrs = &drr->drr_u.drr_spill; VERIFY3S(begin, ==, 1); payload_size = DRR_SPILL_PAYLOAD_SIZE(drrs); (void) sfread(buf, payload_size, stdin); break; } case DRR_WRITE_BYREF: VERIFY3S(begin, ==, 1); fprintf(stderr, "Deduplicated streams are not supported\n"); exit(1); break; case DRR_WRITE: { VERIFY3S(begin, ==, 1); drrw = &thedrr.drr_u.drr_write; payload_size = DRR_WRITE_PAYLOAD_SIZE(drrw); ENTRY *p; char key[KEYSIZE]; snprintf(key, KEYSIZE, "%llu,%llu", (u_longlong_t)drrw->drr_object, (u_longlong_t)drrw->drr_offset); ENTRY e = {.key = key}; p = hsearch(e, FIND); if (p != NULL) { zio_decompress_func_t *xfunc = NULL; switch ((enum zio_compress)(intptr_t)p->data) { case ZIO_COMPRESS_OFF: xfunc = NULL; break; case ZIO_COMPRESS_LZJB: xfunc = lzjb_decompress; break; case ZIO_COMPRESS_GZIP_1: xfunc = gzip_decompress; break; case ZIO_COMPRESS_ZLE: xfunc = zle_decompress; break; case ZIO_COMPRESS_LZ4: xfunc = lz4_decompress_zfs; break; case ZIO_COMPRESS_ZSTD: xfunc = zfs_zstd_decompress; break; default: assert(B_FALSE); } /* * Read and decompress the block */ char *lzbuf = safe_calloc(payload_size); (void) sfread(lzbuf, payload_size, stdin); if (xfunc == NULL) { memcpy(buf, lzbuf, payload_size); drrw->drr_compressiontype = ZIO_COMPRESS_OFF; if (verbose) fprintf(stderr, "Resetting " "compression type to off " "for ino %llu offset " "%llu\n", (u_longlong_t) drrw->drr_object, (u_longlong_t) drrw->drr_offset); } else if (0 != xfunc(lzbuf, buf, payload_size, payload_size, 0)) { /* * The block must not be compressed, * at least not with this compression * type, possibly because it gets * written multiple times in this * stream. 
*/ warnx("decompression failed for " "ino %llu offset %llu", (u_longlong_t)drrw->drr_object, (u_longlong_t)drrw->drr_offset); memcpy(buf, lzbuf, payload_size); } else if (verbose) { drrw->drr_compressiontype = ZIO_COMPRESS_OFF; fprintf(stderr, "successfully " "decompressed ino %llu " "offset %llu\n", (u_longlong_t)drrw->drr_object, (u_longlong_t)drrw->drr_offset); } else { drrw->drr_compressiontype = ZIO_COMPRESS_OFF; } free(lzbuf); } else { /* * Read the contents of the block unaltered */ (void) sfread(buf, payload_size, stdin); } break; } case DRR_WRITE_EMBEDDED: { VERIFY3S(begin, ==, 1); struct drr_write_embedded *drrwe = &drr->drr_u.drr_write_embedded; payload_size = P2ROUNDUP((uint64_t)drrwe->drr_psize, 8); (void) sfread(buf, payload_size, stdin); break; } case DRR_FREEOBJECTS: case DRR_FREE: case DRR_OBJECT_RANGE: VERIFY3S(begin, ==, 1); break; default: (void) fprintf(stderr, "INVALID record type 0x%x\n", drr->drr_type); /* should never happen, so assert */ assert(B_FALSE); } if (feof(stdout)) { fprintf(stderr, "Error: unexpected end-of-file\n"); exit(1); } if (ferror(stdout)) { fprintf(stderr, "Error while reading file: %s\n", strerror(errno)); exit(1); } /* * We need to recalculate the checksum, and it needs to be * initially zero to do that. BEGIN records don't have * a checksum. */ if (drr->drr_type != DRR_BEGIN) { memset(&drr->drr_u.drr_checksum.drr_checksum, 0, sizeof (drr->drr_u.drr_checksum.drr_checksum)); } if (dump_record(drr, buf, payload_size, &stream_cksum, STDOUT_FILENO) != 0) break; if (drr->drr_type == DRR_END) { /* * Typically the END record is either the last * thing in the stream, or it is followed * by a BEGIN record (which also zeros the checksum). * However, a stream package ends with two END * records. The last END record's checksum starts * from zero. 
*/ ZIO_SET_CHECKSUM(&stream_cksum, 0, 0, 0, 0); } } free(buf); fletcher_4_fini(); hdestroy(); return (0); } diff --git a/cmd/zstream/zstream_recompress.c b/cmd/zstream/zstream_recompress.c index 38ef758f8ea4..8392ef3de72f 100644 --- a/cmd/zstream/zstream_recompress.c +++ b/cmd/zstream/zstream_recompress.c @@ -1,373 +1,376 @@ /* * CDDL HEADER START * * The contents of this file are subject to the terms of the * Common Development and Distribution License (the "License"). * You may not use this file except in compliance with the License. * * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE * or https://opensource.org/licenses/CDDL-1.0. * See the License for the specific language governing permissions * and limitations under the License. * * When distributing Covered Code, include this CDDL HEADER in each * file and include the License file at usr/src/OPENSOLARIS.LICENSE. * If applicable, add the following below this CDDL HEADER, with the * fields enclosed by brackets "[]" replaced with your own identifying * information: Portions Copyright [yyyy] [name of copyright owner] * * CDDL HEADER END */ /* * Copyright 2022 Axcient. All rights reserved. * Use is subject to license terms. */ /* * Copyright (c) 2022 by Delphix. All rights reserved. */ #include #include #include #include #include #include #include #include "zfs_fletcher.h" #include "zstream.h" static int dump_record(dmu_replay_record_t *drr, void *payload, int payload_len, zio_cksum_t *zc, int outfd) { assert(offsetof(dmu_replay_record_t, drr_u.drr_checksum.drr_checksum) == sizeof (dmu_replay_record_t) - sizeof (zio_cksum_t)); fletcher_4_incremental_native(drr, offsetof(dmu_replay_record_t, drr_u.drr_checksum.drr_checksum), zc); if (drr->drr_type != DRR_BEGIN) { assert(ZIO_CHECKSUM_IS_ZERO(&drr->drr_u. 
drr_checksum.drr_checksum)); drr->drr_u.drr_checksum.drr_checksum = *zc; } fletcher_4_incremental_native(&drr->drr_u.drr_checksum.drr_checksum, sizeof (zio_cksum_t), zc); if (write(outfd, drr, sizeof (*drr)) == -1) return (errno); if (payload_len != 0) { fletcher_4_incremental_native(payload, payload_len, zc); if (write(outfd, payload, payload_len) == -1) return (errno); } return (0); } int zstream_do_recompress(int argc, char *argv[]) { int bufsz = SPA_MAXBLOCKSIZE; char *buf = safe_malloc(bufsz); dmu_replay_record_t thedrr; dmu_replay_record_t *drr = &thedrr; zio_cksum_t stream_cksum; int c; int level = -1; while ((c = getopt(argc, argv, "l:")) != -1) { switch (c) { case 'l': if (sscanf(optarg, "%d", &level) != 0) { fprintf(stderr, "failed to parse level '%s'\n", optarg); zstream_usage(); } break; case '?': (void) fprintf(stderr, "invalid option '%c'\n", optopt); zstream_usage(); break; } } argc -= optind; argv += optind; if (argc != 1) zstream_usage(); int type = 0; zio_compress_info_t *cinfo = NULL; if (0 == strcmp(argv[0], "off")) { type = ZIO_COMPRESS_OFF; cinfo = &zio_compress_table[type]; } else if (0 == strcmp(argv[0], "inherit") || 0 == strcmp(argv[0], "empty") || 0 == strcmp(argv[0], "on")) { // Fall through to invalid compression type case } else { for (int i = 0; i < ZIO_COMPRESS_FUNCTIONS; i++) { if (0 == strcmp(zio_compress_table[i].ci_name, argv[0])) { cinfo = &zio_compress_table[i]; type = i; break; } } } if (cinfo == NULL) { fprintf(stderr, "Invalid compression type %s.\n", argv[0]); exit(2); } if (cinfo->ci_compress == NULL) { type = 0; cinfo = &zio_compress_table[0]; } if (isatty(STDIN_FILENO)) { (void) fprintf(stderr, "Error: The send stream is a binary format " "and can not be read from a\n" "terminal. 
Standard input must be redirected.\n"); exit(1); } fletcher_4_init(); zio_init(); zstd_init(); int begin = 0; boolean_t seen = B_FALSE; while (sfread(drr, sizeof (*drr), stdin) != 0) { struct drr_write *drrw; uint64_t payload_size = 0; /* * We need to regenerate the checksum. */ if (drr->drr_type != DRR_BEGIN) { memset(&drr->drr_u.drr_checksum.drr_checksum, 0, sizeof (drr->drr_u.drr_checksum.drr_checksum)); } switch (drr->drr_type) { case DRR_BEGIN: { ZIO_SET_CHECKSUM(&stream_cksum, 0, 0, 0, 0); VERIFY0(begin++); seen = B_TRUE; - int sz = drr->drr_payloadlen; + uint32_t sz = drr->drr_payloadlen; + + VERIFY3U(sz, <=, 1U << 28); + if (sz != 0) { if (sz > bufsz) { buf = realloc(buf, sz); if (buf == NULL) err(1, "realloc"); bufsz = sz; } (void) sfread(buf, sz, stdin); } payload_size = sz; break; } case DRR_END: { struct drr_end *drre = &drr->drr_u.drr_end; /* * We would prefer to just check --begin == 0, but * replication streams have an end of stream END * record, so we must avoid tripping it. */ VERIFY3B(seen, ==, B_TRUE); begin--; /* * Use the recalculated checksum, unless this is * the END record of a stream package, which has * no checksum. 
*/ if (!ZIO_CHECKSUM_IS_ZERO(&drre->drr_checksum)) drre->drr_checksum = stream_cksum; break; } case DRR_OBJECT: { struct drr_object *drro = &drr->drr_u.drr_object; VERIFY3S(begin, ==, 1); if (drro->drr_bonuslen > 0) { payload_size = DRR_OBJECT_PAYLOAD_SIZE(drro); (void) sfread(buf, payload_size, stdin); } break; } case DRR_SPILL: { struct drr_spill *drrs = &drr->drr_u.drr_spill; VERIFY3S(begin, ==, 1); payload_size = DRR_SPILL_PAYLOAD_SIZE(drrs); (void) sfread(buf, payload_size, stdin); break; } case DRR_WRITE_BYREF: VERIFY3S(begin, ==, 1); fprintf(stderr, "Deduplicated streams are not supported\n"); exit(1); break; case DRR_WRITE: { VERIFY3S(begin, ==, 1); drrw = &thedrr.drr_u.drr_write; payload_size = DRR_WRITE_PAYLOAD_SIZE(drrw); /* * In order to recompress an encrypted block, you have * to decrypt, decompress, recompress, and * re-encrypt. That can be a future enhancement (along * with decryption or re-encryption), but for now we * skip encrypted blocks. */ boolean_t encrypted = B_FALSE; for (int i = 0; i < ZIO_DATA_SALT_LEN; i++) { if (drrw->drr_salt[i] != 0) { encrypted = B_TRUE; break; } } if (encrypted) { (void) sfread(buf, payload_size, stdin); break; } if (drrw->drr_compressiontype >= ZIO_COMPRESS_FUNCTIONS) { fprintf(stderr, "Invalid compression type in " "stream: %d\n", drrw->drr_compressiontype); exit(3); } zio_compress_info_t *dinfo = &zio_compress_table[drrw->drr_compressiontype]; /* Set up buffers to minimize memcpys */ char *cbuf, *dbuf; if (cinfo->ci_compress == NULL) dbuf = buf; else dbuf = safe_calloc(bufsz); if (dinfo->ci_decompress == NULL) cbuf = dbuf; else cbuf = safe_calloc(payload_size); /* Read and decompress the payload */ (void) sfread(cbuf, payload_size, stdin); if (dinfo->ci_decompress != NULL) { if (0 != dinfo->ci_decompress(cbuf, dbuf, payload_size, MIN(bufsz, drrw->drr_logical_size), dinfo->ci_level)) { warnx("decompression type %d failed " "for ino %llu offset %llu", type, (u_longlong_t)drrw->drr_object, 
(u_longlong_t)drrw->drr_offset); exit(4); } payload_size = drrw->drr_logical_size; free(cbuf); } /* Recompress the payload */ if (cinfo->ci_compress != NULL) { payload_size = P2ROUNDUP(cinfo->ci_compress( dbuf, buf, drrw->drr_logical_size, MIN(payload_size, bufsz), (level == -1 ? cinfo->ci_level : level)), SPA_MINBLOCKSIZE); if (payload_size != drrw->drr_logical_size) { drrw->drr_compressiontype = type; drrw->drr_compressed_size = payload_size; } else { memcpy(buf, dbuf, payload_size); drrw->drr_compressiontype = 0; drrw->drr_compressed_size = 0; } free(dbuf); } else { drrw->drr_compressiontype = type; drrw->drr_compressed_size = 0; } break; } case DRR_WRITE_EMBEDDED: { struct drr_write_embedded *drrwe = &drr->drr_u.drr_write_embedded; VERIFY3S(begin, ==, 1); payload_size = P2ROUNDUP((uint64_t)drrwe->drr_psize, 8); (void) sfread(buf, payload_size, stdin); break; } case DRR_FREEOBJECTS: case DRR_FREE: case DRR_OBJECT_RANGE: VERIFY3S(begin, ==, 1); break; default: (void) fprintf(stderr, "INVALID record type 0x%x\n", drr->drr_type); /* should never happen, so assert */ assert(B_FALSE); } if (feof(stdout)) { fprintf(stderr, "Error: unexpected end-of-file\n"); exit(1); } if (ferror(stdout)) { fprintf(stderr, "Error while reading file: %s\n", strerror(errno)); exit(1); } /* * We need to recalculate the checksum, and it needs to be * initially zero to do that. BEGIN records don't have * a checksum. */ if (drr->drr_type != DRR_BEGIN) { memset(&drr->drr_u.drr_checksum.drr_checksum, 0, sizeof (drr->drr_u.drr_checksum.drr_checksum)); } if (dump_record(drr, buf, payload_size, &stream_cksum, STDOUT_FILENO) != 0) break; if (drr->drr_type == DRR_END) { /* * Typically the END record is either the last * thing in the stream, or it is followed * by a BEGIN record (which also zeros the checksum). * However, a stream package ends with two END * records. The last END record's checksum starts * from zero. 
*/ ZIO_SET_CHECKSUM(&stream_cksum, 0, 0, 0, 0); } } free(buf); fletcher_4_fini(); zio_fini(); zstd_fini(); return (0); } diff --git a/cmd/zstream/zstream_redup.c b/cmd/zstream/zstream_redup.c index 8b12303c5d30..c56a09cee75d 100644 --- a/cmd/zstream/zstream_redup.c +++ b/cmd/zstream/zstream_redup.c @@ -1,486 +1,489 @@ /* * CDDL HEADER START * * This file and its contents are supplied under the terms of the * Common Development and Distribution License ("CDDL"), version 1.0. * You may only use this file in accordance with the terms of version * 1.0 of the CDDL. * * A full copy of the text of the CDDL should have accompanied this * source. A copy of the CDDL is also available via the Internet at * http://www.illumos.org/license/CDDL. * * CDDL HEADER END */ /* * Copyright (c) 2020 by Delphix. All rights reserved. */ #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include "zfs_fletcher.h" #include "zstream.h" #define MAX_RDT_PHYSMEM_PERCENT 20 #define SMALLEST_POSSIBLE_MAX_RDT_MB 128 typedef struct redup_entry { struct redup_entry *rde_next; uint64_t rde_guid; uint64_t rde_object; uint64_t rde_offset; uint64_t rde_stream_offset; } redup_entry_t; typedef struct redup_table { redup_entry_t **redup_hash_array; umem_cache_t *ddecache; uint64_t ddt_count; int numhashbits; } redup_table_t; int highbit64(uint64_t i) { if (i == 0) return (0); return (NBBY * sizeof (uint64_t) - __builtin_clzll(i)); } void * safe_calloc(size_t n) { void *rv = calloc(1, n); if (rv == NULL) { fprintf(stderr, "Error: could not allocate %u bytes of memory\n", (int)n); exit(1); } return (rv); } /* * Safe version of fread(), exits on error. */ int sfread(void *buf, size_t size, FILE *fp) { int rv = fread(buf, size, 1, fp); if (rv == 0 && ferror(fp)) { (void) fprintf(stderr, "Error while reading file: %s\n", strerror(errno)); exit(1); } return (rv); } /* * Safe version of pread(), exits on error. 
/*
 * Safe version of pread(), exits on error.
 *
 * Reads exactly count bytes from fd, starting at file offset `offset`, into
 * buf.  pread() may legitimately return fewer bytes than requested or fail
 * with EINTR, so the read is retried until the request is satisfied.
 * Hitting end-of-file before count bytes were read is reported as a short
 * read.  Any failure prints a message to stderr and exits with status 1.
 */
static void
spread(int fd, void *buf, size_t count, off_t offset)
{
	char *dst = buf;
	size_t done = 0;

	while (done < count) {
		ssize_t n = pread(fd, dst + done, count - done,
		    offset + (off_t)done);

		if (n == -1) {
			/* interrupted before any data arrived: retry */
			if (errno == EINTR)
				continue;
			(void) fprintf(stderr,
			    "Error while reading file: %s\n",
			    strerror(errno));
			exit(1);
		}
		if (n == 0) {
			/* end-of-file before the request was satisfied */
			(void) fprintf(stderr,
			    "Error while reading file: short read\n");
			exit(1);
		}
		done += (size_t)n;
	}
}
guid && rde->rde_object == object && rde->rde_offset == offset) { *stream_offsetp = rde->rde_stream_offset; return; } } assert(!"could not find expected redup table entry"); } /* * Convert a dedup stream (generated by "zfs send -D") to a * non-deduplicated stream. The entire infd will be converted, including * any substreams in a stream package (generated by "zfs send -RD"). The * infd must be seekable. */ static void zfs_redup_stream(int infd, int outfd, boolean_t verbose) { int bufsz = SPA_MAXBLOCKSIZE; dmu_replay_record_t thedrr = { 0 }; dmu_replay_record_t *drr = &thedrr; redup_table_t rdt; zio_cksum_t stream_cksum; uint64_t numbuckets; uint64_t num_records = 0; uint64_t num_write_byref_records = 0; #ifdef _ILP32 uint64_t max_rde_size = SMALLEST_POSSIBLE_MAX_RDT_MB << 20; #else uint64_t physmem = sysconf(_SC_PHYS_PAGES) * sysconf(_SC_PAGESIZE); uint64_t max_rde_size = MAX((physmem * MAX_RDT_PHYSMEM_PERCENT) / 100, SMALLEST_POSSIBLE_MAX_RDT_MB << 20); #endif numbuckets = max_rde_size / (sizeof (redup_entry_t)); /* * numbuckets must be a power of 2. Increase number to * a power of 2 if necessary. */ if (!ISP2(numbuckets)) numbuckets = 1ULL << highbit64(numbuckets); rdt.redup_hash_array = safe_calloc(numbuckets * sizeof (redup_entry_t *)); rdt.ddecache = umem_cache_create("rde", sizeof (redup_entry_t), 0, NULL, NULL, NULL, NULL, NULL, 0); rdt.numhashbits = highbit64(numbuckets) - 1; rdt.ddt_count = 0; char *buf = safe_calloc(bufsz); FILE *ofp = fdopen(infd, "r"); long offset = ftell(ofp); int begin = 0; boolean_t seen = B_FALSE; while (sfread(drr, sizeof (*drr), ofp) != 0) { num_records++; /* * We need to regenerate the checksum. 
*/ if (drr->drr_type != DRR_BEGIN) { memset(&drr->drr_u.drr_checksum.drr_checksum, 0, sizeof (drr->drr_u.drr_checksum.drr_checksum)); } uint64_t payload_size = 0; switch (drr->drr_type) { case DRR_BEGIN: { struct drr_begin *drrb = &drr->drr_u.drr_begin; int fflags; ZIO_SET_CHECKSUM(&stream_cksum, 0, 0, 0, 0); VERIFY0(begin++); seen = B_TRUE; assert(drrb->drr_magic == DMU_BACKUP_MAGIC); /* clear the DEDUP feature flag for this stream */ fflags = DMU_GET_FEATUREFLAGS(drrb->drr_versioninfo); fflags &= ~(DMU_BACKUP_FEATURE_DEDUP | DMU_BACKUP_FEATURE_DEDUPPROPS); /* cppcheck-suppress syntaxError */ DMU_SET_FEATUREFLAGS(drrb->drr_versioninfo, fflags); - int sz = drr->drr_payloadlen; + uint32_t sz = drr->drr_payloadlen; + + VERIFY3U(sz, <=, 1U << 28); + if (sz != 0) { if (sz > bufsz) { free(buf); buf = safe_calloc(sz); bufsz = sz; } (void) sfread(buf, sz, ofp); } payload_size = sz; break; } case DRR_END: { struct drr_end *drre = &drr->drr_u.drr_end; /* * We would prefer to just check --begin == 0, but * replication streams have an end of stream END * record, so we must avoid tripping it. */ VERIFY3B(seen, ==, B_TRUE); begin--; /* * Use the recalculated checksum, unless this is * the END record of a stream package, which has * no checksum. */ if (!ZIO_CHECKSUM_IS_ZERO(&drre->drr_checksum)) drre->drr_checksum = stream_cksum; break; } case DRR_OBJECT: { struct drr_object *drro = &drr->drr_u.drr_object; VERIFY3S(begin, ==, 1); if (drro->drr_bonuslen > 0) { payload_size = DRR_OBJECT_PAYLOAD_SIZE(drro); (void) sfread(buf, payload_size, ofp); } break; } case DRR_SPILL: { struct drr_spill *drrs = &drr->drr_u.drr_spill; VERIFY3S(begin, ==, 1); payload_size = DRR_SPILL_PAYLOAD_SIZE(drrs); (void) sfread(buf, payload_size, ofp); break; } case DRR_WRITE_BYREF: { struct drr_write_byref drrwb = drr->drr_u.drr_write_byref; VERIFY3S(begin, ==, 1); num_write_byref_records++; /* * Look up in hash table by drrwb->drr_refguid, * drr_refobject, drr_refoffset. 
Replace this * record with the found WRITE record, but with * drr_object,drr_offset,drr_toguid replaced with ours. */ uint64_t stream_offset = 0; rdt_lookup(&rdt, drrwb.drr_refguid, drrwb.drr_refobject, drrwb.drr_refoffset, &stream_offset); spread(infd, drr, sizeof (*drr), stream_offset); assert(drr->drr_type == DRR_WRITE); struct drr_write *drrw = &drr->drr_u.drr_write; assert(drrw->drr_toguid == drrwb.drr_refguid); assert(drrw->drr_object == drrwb.drr_refobject); assert(drrw->drr_offset == drrwb.drr_refoffset); payload_size = DRR_WRITE_PAYLOAD_SIZE(drrw); spread(infd, buf, payload_size, stream_offset + sizeof (*drr)); drrw->drr_toguid = drrwb.drr_toguid; drrw->drr_object = drrwb.drr_object; drrw->drr_offset = drrwb.drr_offset; break; } case DRR_WRITE: { struct drr_write *drrw = &drr->drr_u.drr_write; VERIFY3S(begin, ==, 1); payload_size = DRR_WRITE_PAYLOAD_SIZE(drrw); (void) sfread(buf, payload_size, ofp); rdt_insert(&rdt, drrw->drr_toguid, drrw->drr_object, drrw->drr_offset, offset); break; } case DRR_WRITE_EMBEDDED: { struct drr_write_embedded *drrwe = &drr->drr_u.drr_write_embedded; VERIFY3S(begin, ==, 1); payload_size = P2ROUNDUP((uint64_t)drrwe->drr_psize, 8); (void) sfread(buf, payload_size, ofp); break; } case DRR_FREEOBJECTS: case DRR_FREE: case DRR_OBJECT_RANGE: VERIFY3S(begin, ==, 1); break; default: (void) fprintf(stderr, "INVALID record type 0x%x\n", drr->drr_type); /* should never happen, so assert */ assert(B_FALSE); } if (feof(ofp)) { fprintf(stderr, "Error: unexpected end-of-file\n"); exit(1); } if (ferror(ofp)) { fprintf(stderr, "Error while reading file: %s\n", strerror(errno)); exit(1); } /* * We need to recalculate the checksum, and it needs to be * initially zero to do that. BEGIN records don't have * a checksum. 
*/ if (drr->drr_type != DRR_BEGIN) { memset(&drr->drr_u.drr_checksum.drr_checksum, 0, sizeof (drr->drr_u.drr_checksum.drr_checksum)); } if (dump_record(drr, buf, payload_size, &stream_cksum, outfd) != 0) break; if (drr->drr_type == DRR_END) { /* * Typically the END record is either the last * thing in the stream, or it is followed * by a BEGIN record (which also zeros the checksum). * However, a stream package ends with two END * records. The last END record's checksum starts * from zero. */ ZIO_SET_CHECKSUM(&stream_cksum, 0, 0, 0, 0); } offset = ftell(ofp); } if (verbose) { char mem_str[16]; zfs_nicenum(rdt.ddt_count * sizeof (redup_entry_t), mem_str, sizeof (mem_str)); fprintf(stderr, "converted stream with %llu total records, " "including %llu dedup records, using %sB memory.\n", (long long)num_records, (long long)num_write_byref_records, mem_str); } umem_cache_destroy(rdt.ddecache); free(rdt.redup_hash_array); free(buf); (void) fclose(ofp); } int zstream_do_redup(int argc, char *argv[]) { boolean_t verbose = B_FALSE; int c; while ((c = getopt(argc, argv, "v")) != -1) { switch (c) { case 'v': verbose = B_TRUE; break; case '?': (void) fprintf(stderr, "invalid option '%c'\n", optopt); zstream_usage(); break; } } argc -= optind; argv += optind; if (argc != 1) zstream_usage(); const char *filename = argv[0]; if (isatty(STDOUT_FILENO)) { (void) fprintf(stderr, "Error: Stream can not be written to a terminal.\n" "You must redirect standard output.\n"); return (1); } int fd = open(filename, O_RDONLY); if (fd == -1) { (void) fprintf(stderr, "Error while opening file '%s': %s\n", filename, strerror(errno)); exit(1); } fletcher_4_init(); zfs_redup_stream(fd, STDOUT_FILENO, verbose); fletcher_4_fini(); close(fd); return (0); } diff --git a/lib/libzfs/libzfs_sendrecv.c b/lib/libzfs/libzfs_sendrecv.c index 49ae7d449b70..038613a1fcfa 100644 --- a/lib/libzfs/libzfs_sendrecv.c +++ b/lib/libzfs/libzfs_sendrecv.c @@ -1,5545 +1,5553 @@ /* * CDDL HEADER START * * The contents of 
this file are subject to the terms of the * Common Development and Distribution License (the "License"). * You may not use this file except in compliance with the License. * * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE * or https://opensource.org/licenses/CDDL-1.0. * See the License for the specific language governing permissions * and limitations under the License. * * When distributing Covered Code, include this CDDL HEADER in each * file and include the License file at usr/src/OPENSOLARIS.LICENSE. * If applicable, add the following below this CDDL HEADER, with the * fields enclosed by brackets "[]" replaced with your own identifying * information: Portions Copyright [yyyy] [name of copyright owner] * * CDDL HEADER END */ /* * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. * Copyright (c) 2011, 2020 by Delphix. All rights reserved. * Copyright (c) 2012, Joyent, Inc. All rights reserved. * Copyright (c) 2012 Pawel Jakub Dawidek . * All rights reserved * Copyright (c) 2013 Steven Hartland. All rights reserved. * Copyright 2015, OmniTI Computer Consulting, Inc. All rights reserved. * Copyright 2016 Igor Kozhukhov * Copyright (c) 2018, loli10K . All rights reserved. * Copyright (c) 2019 Datto Inc. 
*/ #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include "zfs_namecheck.h" #include "zfs_prop.h" #include "zfs_fletcher.h" #include "libzfs_impl.h" #include #include #include #include #include #include #include static int zfs_receive_impl(libzfs_handle_t *, const char *, const char *, recvflags_t *, int, const char *, nvlist_t *, avl_tree_t *, char **, const char *, nvlist_t *); static int guid_to_name_redact_snaps(libzfs_handle_t *hdl, const char *parent, uint64_t guid, boolean_t bookmark_ok, uint64_t *redact_snap_guids, uint64_t num_redact_snaps, char *name); static int guid_to_name(libzfs_handle_t *, const char *, uint64_t, boolean_t, char *); typedef struct progress_arg { zfs_handle_t *pa_zhp; int pa_fd; boolean_t pa_parsable; boolean_t pa_estimate; int pa_verbosity; boolean_t pa_astitle; uint64_t pa_size; } progress_arg_t; static int dump_record(dmu_replay_record_t *drr, void *payload, size_t payload_len, zio_cksum_t *zc, int outfd) { ASSERT3U(offsetof(dmu_replay_record_t, drr_u.drr_checksum.drr_checksum), ==, sizeof (dmu_replay_record_t) - sizeof (zio_cksum_t)); fletcher_4_incremental_native(drr, offsetof(dmu_replay_record_t, drr_u.drr_checksum.drr_checksum), zc); if (drr->drr_type != DRR_BEGIN) { ASSERT(ZIO_CHECKSUM_IS_ZERO(&drr->drr_u. 
drr_checksum.drr_checksum)); drr->drr_u.drr_checksum.drr_checksum = *zc; } fletcher_4_incremental_native(&drr->drr_u.drr_checksum.drr_checksum, sizeof (zio_cksum_t), zc); if (write(outfd, drr, sizeof (*drr)) == -1) return (errno); if (payload_len != 0) { fletcher_4_incremental_native(payload, payload_len, zc); if (write(outfd, payload, payload_len) == -1) return (errno); } return (0); } /* * Routines for dealing with the AVL tree of fs-nvlists */ typedef struct fsavl_node { avl_node_t fn_node; nvlist_t *fn_nvfs; char *fn_snapname; uint64_t fn_guid; } fsavl_node_t; static int fsavl_compare(const void *arg1, const void *arg2) { const fsavl_node_t *fn1 = (const fsavl_node_t *)arg1; const fsavl_node_t *fn2 = (const fsavl_node_t *)arg2; return (TREE_CMP(fn1->fn_guid, fn2->fn_guid)); } /* * Given the GUID of a snapshot, find its containing filesystem and * (optionally) name. */ static nvlist_t * fsavl_find(avl_tree_t *avl, uint64_t snapguid, char **snapname) { fsavl_node_t fn_find; fsavl_node_t *fn; fn_find.fn_guid = snapguid; fn = avl_find(avl, &fn_find, NULL); if (fn) { if (snapname) *snapname = fn->fn_snapname; return (fn->fn_nvfs); } return (NULL); } static void fsavl_destroy(avl_tree_t *avl) { fsavl_node_t *fn; void *cookie; if (avl == NULL) return; cookie = NULL; while ((fn = avl_destroy_nodes(avl, &cookie)) != NULL) free(fn); avl_destroy(avl); free(avl); } /* * Given an nvlist, produce an avl tree of snapshots, ordered by guid */ static avl_tree_t * fsavl_create(nvlist_t *fss) { avl_tree_t *fsavl; nvpair_t *fselem = NULL; if ((fsavl = malloc(sizeof (avl_tree_t))) == NULL) return (NULL); avl_create(fsavl, fsavl_compare, sizeof (fsavl_node_t), offsetof(fsavl_node_t, fn_node)); while ((fselem = nvlist_next_nvpair(fss, fselem)) != NULL) { nvlist_t *nvfs, *snaps; nvpair_t *snapelem = NULL; nvfs = fnvpair_value_nvlist(fselem); snaps = fnvlist_lookup_nvlist(nvfs, "snaps"); while ((snapelem = nvlist_next_nvpair(snaps, snapelem)) != NULL) { fsavl_node_t *fn; if ((fn = 
malloc(sizeof (fsavl_node_t))) == NULL) { fsavl_destroy(fsavl); return (NULL); } fn->fn_nvfs = nvfs; fn->fn_snapname = nvpair_name(snapelem); fn->fn_guid = fnvpair_value_uint64(snapelem); /* * Note: if there are multiple snaps with the * same GUID, we ignore all but one. */ avl_index_t where = 0; if (avl_find(fsavl, fn, &where) == NULL) avl_insert(fsavl, fn, where); else free(fn); } } return (fsavl); } /* * Routines for dealing with the giant nvlist of fs-nvlists, etc. */ typedef struct send_data { /* * assigned inside every recursive call, * restored from *_save on return: * * guid of fromsnap snapshot in parent dataset * txg of fromsnap snapshot in current dataset * txg of tosnap snapshot in current dataset */ uint64_t parent_fromsnap_guid; uint64_t fromsnap_txg; uint64_t tosnap_txg; /* the nvlists get accumulated during depth-first traversal */ nvlist_t *parent_snaps; nvlist_t *fss; nvlist_t *snapprops; nvlist_t *snapholds; /* user holds */ /* send-receive configuration, does not change during traversal */ const char *fsname; const char *fromsnap; const char *tosnap; boolean_t recursive; boolean_t raw; boolean_t doall; boolean_t replicate; boolean_t skipmissing; boolean_t verbose; boolean_t backup; boolean_t seenfrom; boolean_t seento; boolean_t holds; /* were holds requested with send -h */ boolean_t props; /* * The header nvlist is of the following format: * { * "tosnap" -> string * "fromsnap" -> string (if incremental) * "fss" -> { * id -> { * * "name" -> string (full name; for debugging) * "parentfromsnap" -> number (guid of fromsnap in parent) * * "props" -> { name -> value (only if set here) } * "snaps" -> { name (lastname) -> number (guid) } * "snapprops" -> { name (lastname) -> { name -> value } } * "snapholds" -> { name (lastname) -> { holdname -> crtime } } * * "origin" -> number (guid) (if clone) * "is_encroot" -> boolean * "sent" -> boolean (not on-disk) * } * } * } * */ } send_data_t; static void send_iterate_prop(zfs_handle_t *zhp, boolean_t 
/*
 * Snapshot visitor (used with zfs_iter_snapshots_sorted) that records one
 * snapshot in the send_data_t passed as arg: its guid goes into
 * sd->parent_snaps, its properties into sd->snapprops and, when holds were
 * requested, its user holds into sd->snapholds.
 *
 * The handle is closed on every path and the function always returns 0.
 */
static int
send_iterate_snap(zfs_handle_t *zhp, void *arg)
{
	send_data_t *sd = arg;
	const char *fromname = sd->fromsnap;
	const char *toname = sd->tosnap;
	const uint64_t snapguid = zhp->zfs_dmustats.dds_guid;
	const uint64_t snaptxg = zhp->zfs_dmustats.dds_creation_txg;
	nvlist_t *props;

	/* The short name is whatever follows the last '@'. */
	char *shortname = strrchr(zhp->zfs_name, '@');
	assert(shortname != NULL);
	shortname++;

	const boolean_t is_from =
	    (fromname != NULL && strcmp(fromname, shortname) == 0);
	const boolean_t is_to =
	    (toname != NULL && strcmp(toname, shortname) == 0);

	/* Snapshots newer than the destination snapshot are not recorded. */
	if (sd->tosnap_txg != 0 && snaptxg > sd->tosnap_txg) {
		if (sd->verbose) {
			(void) fprintf(stderr, dgettext(TEXT_DOMAIN,
			    "skipping snapshot %s because it was created "
			    "after the destination snapshot (%s)\n"),
			    zhp->zfs_name, toname);
		}
		goto done;
	}

	fnvlist_add_uint64(sd->parent_snaps, shortname, snapguid);

	/*
	 * NB: if there is no fromsnap here (it's a newly created fs in
	 * an incremental replication), we will substitute the tosnap.
	 */
	if (is_from || (is_to && sd->parent_fromsnap_guid == 0))
		sd->parent_fromsnap_guid = snapguid;

	if (!sd->recursive) {
		/*
		 * To allow a doall stream to work properly
		 * with a NULL fromsnap
		 */
		if (sd->doall && fromname == NULL && !sd->seenfrom)
			sd->seenfrom = B_TRUE;

		/* The fromsnap itself only opens the window. */
		if (!sd->seenfrom && is_from) {
			sd->seenfrom = B_TRUE;
			goto done;
		}

		/*
		 * Outside the from..to window nothing more is collected,
		 * except for the tosnap of a send that has no fromsnap.
		 */
		if ((sd->seento || !sd->seenfrom) &&
		    !(is_to && fromname == NULL))
			goto done;

		if (is_to)
			sd->seento = B_TRUE;
	}

	props = fnvlist_alloc();
	send_iterate_prop(zhp, sd->backup, props);
	fnvlist_add_nvlist(sd->snapprops, shortname, props);
	fnvlist_free(props);

	if (sd->holds) {
		nvlist_t *holds;

		/* A failed lookup simply leaves this snapshot without holds. */
		if (lzc_get_holds(zhp->zfs_name, &holds) == 0) {
			fnvlist_add_nvlist(sd->snapholds, shortname, holds);
			fnvlist_free(holds);
		}
	}

done:
	zfs_close(zhp);
	return (0);
}
*/ if (prop == ZPROP_INVAL) continue; if (zfs_prop_readonly(prop)) continue; } nvlist_t *propnv = fnvpair_value_nvlist(elem); boolean_t isspacelimit = (prop == ZFS_PROP_QUOTA || prop == ZFS_PROP_RESERVATION || prop == ZFS_PROP_REFQUOTA || prop == ZFS_PROP_REFRESERVATION); if (isspacelimit && zhp->zfs_type == ZFS_TYPE_SNAPSHOT) continue; char *source; if (nvlist_lookup_string(propnv, ZPROP_SOURCE, &source) == 0) { if (strcmp(source, zhp->zfs_name) != 0 && strcmp(source, ZPROP_SOURCE_VAL_RECVD) != 0) continue; } else { /* * May have no source before SPA_VERSION_RECVD_PROPS, * but is still modifiable. */ if (!isspacelimit) continue; } if (zfs_prop_user(propname) || zfs_prop_get_type(prop) == PROP_TYPE_STRING) { char *value; value = fnvlist_lookup_string(propnv, ZPROP_VALUE); fnvlist_add_string(nv, propname, value); } else { uint64_t value; value = fnvlist_lookup_uint64(propnv, ZPROP_VALUE); fnvlist_add_uint64(nv, propname, value); } } } /* * returns snapshot guid * and returns 0 if the snapshot does not exist */ static uint64_t get_snap_guid(libzfs_handle_t *hdl, const char *fs, const char *snap) { char name[MAXPATHLEN + 1]; uint64_t guid = 0; if (fs == NULL || fs[0] == '\0' || snap == NULL || snap[0] == '\0') return (guid); (void) snprintf(name, sizeof (name), "%s@%s", fs, snap); zfs_handle_t *zhp = zfs_open(hdl, name, ZFS_TYPE_SNAPSHOT); if (zhp != NULL) { guid = zfs_prop_get_int(zhp, ZFS_PROP_GUID); zfs_close(zhp); } return (guid); } /* * returns snapshot creation txg * and returns 0 if the snapshot does not exist */ static uint64_t get_snap_txg(libzfs_handle_t *hdl, const char *fs, const char *snap) { char name[ZFS_MAX_DATASET_NAME_LEN]; uint64_t txg = 0; if (fs == NULL || fs[0] == '\0' || snap == NULL || snap[0] == '\0') return (txg); (void) snprintf(name, sizeof (name), "%s@%s", fs, snap); if (zfs_dataset_exists(hdl, name, ZFS_TYPE_SNAPSHOT)) { zfs_handle_t *zhp = zfs_open(hdl, name, ZFS_TYPE_SNAPSHOT); if (zhp != NULL) { txg = zfs_prop_get_int(zhp, 
ZFS_PROP_CREATETXG); zfs_close(zhp); } } return (txg); } /* * Recursively generate nvlists describing datasets. See comment * for the data structure send_data_t above for description of contents * of the nvlist. */ static int send_iterate_fs(zfs_handle_t *zhp, void *arg) { send_data_t *sd = arg; nvlist_t *nvfs = NULL, *nv = NULL; int rv = 0; uint64_t min_txg = 0, max_txg = 0; uint64_t txg = zhp->zfs_dmustats.dds_creation_txg; uint64_t guid = zhp->zfs_dmustats.dds_guid; uint64_t fromsnap_txg, tosnap_txg; char guidstring[64]; /* These fields are restored on return from a recursive call. */ uint64_t parent_fromsnap_guid_save = sd->parent_fromsnap_guid; uint64_t fromsnap_txg_save = sd->fromsnap_txg; uint64_t tosnap_txg_save = sd->tosnap_txg; fromsnap_txg = get_snap_txg(zhp->zfs_hdl, zhp->zfs_name, sd->fromsnap); if (fromsnap_txg != 0) sd->fromsnap_txg = fromsnap_txg; tosnap_txg = get_snap_txg(zhp->zfs_hdl, zhp->zfs_name, sd->tosnap); if (tosnap_txg != 0) sd->tosnap_txg = tosnap_txg; /* * On the send side, if the current dataset does not have tosnap, * perform two additional checks: * * - Skip sending the current dataset if it was created later than * the parent tosnap. * - Return error if the current dataset was created earlier than * the parent tosnap, unless --skip-missing specified. Then * just print a warning. */ if (sd->tosnap != NULL && tosnap_txg == 0) { if (sd->tosnap_txg != 0 && txg > sd->tosnap_txg) { if (sd->verbose) { (void) fprintf(stderr, dgettext(TEXT_DOMAIN, "skipping dataset %s: snapshot %s does " "not exist\n"), zhp->zfs_name, sd->tosnap); } } else if (sd->skipmissing) { (void) fprintf(stderr, dgettext(TEXT_DOMAIN, "WARNING: skipping dataset %s and its children:" " snapshot %s does not exist\n"), zhp->zfs_name, sd->tosnap); } else { (void) fprintf(stderr, dgettext(TEXT_DOMAIN, "cannot send %s@%s%s: snapshot %s@%s does not " "exist\n"), sd->fsname, sd->tosnap, sd->recursive ? 
dgettext(TEXT_DOMAIN, " recursively") : "", zhp->zfs_name, sd->tosnap); rv = EZFS_NOENT; } goto out; } nvfs = fnvlist_alloc(); fnvlist_add_string(nvfs, "name", zhp->zfs_name); fnvlist_add_uint64(nvfs, "parentfromsnap", sd->parent_fromsnap_guid); if (zhp->zfs_dmustats.dds_origin[0] != '\0') { zfs_handle_t *origin = zfs_open(zhp->zfs_hdl, zhp->zfs_dmustats.dds_origin, ZFS_TYPE_SNAPSHOT); if (origin == NULL) { rv = -1; goto out; } fnvlist_add_uint64(nvfs, "origin", origin->zfs_dmustats.dds_guid); zfs_close(origin); } /* Iterate over props. */ if (sd->props || sd->backup || sd->recursive) { nv = fnvlist_alloc(); send_iterate_prop(zhp, sd->backup, nv); fnvlist_add_nvlist(nvfs, "props", nv); } if (zfs_prop_get_int(zhp, ZFS_PROP_ENCRYPTION) != ZIO_CRYPT_OFF) { boolean_t encroot; /* Determine if this dataset is an encryption root. */ if (zfs_crypto_get_encryption_root(zhp, &encroot, NULL) != 0) { rv = -1; goto out; } if (encroot) fnvlist_add_boolean(nvfs, "is_encroot"); /* * Encrypted datasets can only be sent with properties if * the raw flag is specified because the receive side doesn't * currently have a mechanism for recursively asking the user * for new encryption parameters. */ if (!sd->raw) { (void) fprintf(stderr, dgettext(TEXT_DOMAIN, "cannot send %s@%s: encrypted dataset %s may not " "be sent with properties without the raw flag\n"), sd->fsname, sd->tosnap, zhp->zfs_name); rv = -1; goto out; } } /* * Iterate over snaps, and set sd->parent_fromsnap_guid. * * If this is a "doall" send, a replicate send or we're just trying * to gather a list of previous snapshots, iterate through all the * snaps in the txg range. Otherwise just look at the one we're * interested in. 
*/ sd->parent_fromsnap_guid = 0; sd->parent_snaps = fnvlist_alloc(); sd->snapprops = fnvlist_alloc(); if (sd->holds) sd->snapholds = fnvlist_alloc(); if (sd->doall || sd->replicate || sd->tosnap == NULL) { if (!sd->replicate && fromsnap_txg != 0) min_txg = fromsnap_txg; if (!sd->replicate && tosnap_txg != 0) max_txg = tosnap_txg; (void) zfs_iter_snapshots_sorted(zhp, 0, send_iterate_snap, sd, min_txg, max_txg); } else { char snapname[MAXPATHLEN] = { 0 }; zfs_handle_t *snap; (void) snprintf(snapname, sizeof (snapname), "%s@%s", zhp->zfs_name, sd->tosnap); if (sd->fromsnap != NULL) sd->seenfrom = B_TRUE; snap = zfs_open(zhp->zfs_hdl, snapname, ZFS_TYPE_SNAPSHOT); if (snap != NULL) (void) send_iterate_snap(snap, sd); } fnvlist_add_nvlist(nvfs, "snaps", sd->parent_snaps); fnvlist_free(sd->parent_snaps); fnvlist_add_nvlist(nvfs, "snapprops", sd->snapprops); fnvlist_free(sd->snapprops); if (sd->holds) { fnvlist_add_nvlist(nvfs, "snapholds", sd->snapholds); fnvlist_free(sd->snapholds); } /* Do not allow the size of the properties list to exceed the limit */ if ((fnvlist_size(nvfs) + fnvlist_size(sd->fss)) > zhp->zfs_hdl->libzfs_max_nvlist) { (void) fprintf(stderr, dgettext(TEXT_DOMAIN, "warning: cannot send %s@%s: the size of the list of " "snapshots and properties is too large to be received " "successfully.\n" "Select a smaller number of snapshots to send.\n"), zhp->zfs_name, sd->tosnap); rv = EZFS_NOSPC; goto out; } /* Add this fs to nvlist. */ (void) snprintf(guidstring, sizeof (guidstring), "0x%llx", (longlong_t)guid); fnvlist_add_nvlist(sd->fss, guidstring, nvfs); /* Iterate over children. */ if (sd->recursive) rv = zfs_iter_filesystems(zhp, 0, send_iterate_fs, sd); out: /* Restore saved fields. 
*/ sd->parent_fromsnap_guid = parent_fromsnap_guid_save; sd->fromsnap_txg = fromsnap_txg_save; sd->tosnap_txg = tosnap_txg_save; fnvlist_free(nv); fnvlist_free(nvfs); zfs_close(zhp); return (rv); } static int gather_nvlist(libzfs_handle_t *hdl, const char *fsname, const char *fromsnap, const char *tosnap, boolean_t recursive, boolean_t raw, boolean_t doall, boolean_t replicate, boolean_t skipmissing, boolean_t verbose, boolean_t backup, boolean_t holds, boolean_t props, nvlist_t **nvlp, avl_tree_t **avlp) { zfs_handle_t *zhp; send_data_t sd = { 0 }; int error; zhp = zfs_open(hdl, fsname, ZFS_TYPE_FILESYSTEM | ZFS_TYPE_VOLUME); if (zhp == NULL) return (EZFS_BADTYPE); sd.fss = fnvlist_alloc(); sd.fsname = fsname; sd.fromsnap = fromsnap; sd.tosnap = tosnap; sd.recursive = recursive; sd.raw = raw; sd.doall = doall; sd.replicate = replicate; sd.skipmissing = skipmissing; sd.verbose = verbose; sd.backup = backup; sd.holds = holds; sd.props = props; if ((error = send_iterate_fs(zhp, &sd)) != 0) { fnvlist_free(sd.fss); if (avlp != NULL) *avlp = NULL; *nvlp = NULL; return (error); } if (avlp != NULL && (*avlp = fsavl_create(sd.fss)) == NULL) { fnvlist_free(sd.fss); *nvlp = NULL; return (EZFS_NOMEM); } *nvlp = sd.fss; return (0); } /* * Routines specific to "zfs send" */ typedef struct send_dump_data { /* these are all just the short snapname (the part after the @) */ const char *fromsnap; const char *tosnap; char prevsnap[ZFS_MAX_DATASET_NAME_LEN]; uint64_t prevsnap_obj; boolean_t seenfrom, seento, replicate, doall, fromorigin; boolean_t dryrun, parsable, progress, embed_data, std_out; boolean_t large_block, compress, raw, holds; boolean_t progressastitle; int outfd; boolean_t err; nvlist_t *fss; nvlist_t *snapholds; avl_tree_t *fsavl; snapfilter_cb_t *filter_cb; void *filter_cb_arg; nvlist_t *debugnv; char holdtag[ZFS_MAX_DATASET_NAME_LEN]; int cleanup_fd; int verbosity; uint64_t size; } send_dump_data_t; static int zfs_send_space(zfs_handle_t *zhp, const char *snapname, 
const char *from, enum lzc_send_flags flags, uint64_t *spacep) { assert(snapname != NULL); int error = lzc_send_space(snapname, from, flags, spacep); if (error == 0) return (0); char errbuf[ERRBUFLEN]; (void) snprintf(errbuf, sizeof (errbuf), dgettext(TEXT_DOMAIN, "warning: cannot estimate space for '%s'"), snapname); libzfs_handle_t *hdl = zhp->zfs_hdl; switch (error) { case EXDEV: zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "not an earlier snapshot from the same fs")); return (zfs_error(hdl, EZFS_CROSSTARGET, errbuf)); case ENOENT: if (zfs_dataset_exists(hdl, snapname, ZFS_TYPE_SNAPSHOT)) { zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "incremental source (%s) does not exist"), snapname); } return (zfs_error(hdl, EZFS_NOENT, errbuf)); case EDQUOT: case EFBIG: case EIO: case ENOLINK: case ENOSPC: case ENOSTR: case ENXIO: case EPIPE: case ERANGE: case EFAULT: case EROFS: case EINVAL: zfs_error_aux(hdl, "%s", strerror(error)); return (zfs_error(hdl, EZFS_BADBACKUP, errbuf)); default: return (zfs_standard_error(hdl, error, errbuf)); } } /* * Dumps a backup of the given snapshot (incremental from fromsnap if it's not * NULL) to the file descriptor specified by outfd. 
*/ static int dump_ioctl(zfs_handle_t *zhp, const char *fromsnap, uint64_t fromsnap_obj, boolean_t fromorigin, int outfd, enum lzc_send_flags flags, nvlist_t *debugnv) { zfs_cmd_t zc = {"\0"}; libzfs_handle_t *hdl = zhp->zfs_hdl; nvlist_t *thisdbg; assert(zhp->zfs_type == ZFS_TYPE_SNAPSHOT); assert(fromsnap_obj == 0 || !fromorigin); (void) strlcpy(zc.zc_name, zhp->zfs_name, sizeof (zc.zc_name)); zc.zc_cookie = outfd; zc.zc_obj = fromorigin; zc.zc_sendobj = zfs_prop_get_int(zhp, ZFS_PROP_OBJSETID); zc.zc_fromobj = fromsnap_obj; zc.zc_flags = flags; if (debugnv != NULL) { thisdbg = fnvlist_alloc(); if (fromsnap != NULL && fromsnap[0] != '\0') fnvlist_add_string(thisdbg, "fromsnap", fromsnap); } if (zfs_ioctl(zhp->zfs_hdl, ZFS_IOC_SEND, &zc) != 0) { char errbuf[ERRBUFLEN]; int error = errno; (void) snprintf(errbuf, sizeof (errbuf), "%s '%s'", dgettext(TEXT_DOMAIN, "warning: cannot send"), zhp->zfs_name); if (debugnv != NULL) { fnvlist_add_uint64(thisdbg, "error", error); fnvlist_add_nvlist(debugnv, zhp->zfs_name, thisdbg); fnvlist_free(thisdbg); } switch (error) { case EXDEV: zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "not an earlier snapshot from the same fs")); return (zfs_error(hdl, EZFS_CROSSTARGET, errbuf)); case EACCES: zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "source key must be loaded")); return (zfs_error(hdl, EZFS_CRYPTOFAILED, errbuf)); case ENOENT: if (zfs_dataset_exists(hdl, zc.zc_name, ZFS_TYPE_SNAPSHOT)) { zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "incremental source (@%s) does not exist"), zc.zc_value); } return (zfs_error(hdl, EZFS_NOENT, errbuf)); case EDQUOT: case EFBIG: case EIO: case ENOLINK: case ENOSPC: case ENOSTR: case ENXIO: case EPIPE: case ERANGE: case EFAULT: case EROFS: case EINVAL: zfs_error_aux(hdl, "%s", strerror(errno)); return (zfs_error(hdl, EZFS_BADBACKUP, errbuf)); default: return (zfs_standard_error(hdl, errno, errbuf)); } } if (debugnv != NULL) { fnvlist_add_nvlist(debugnv, zhp->zfs_name, thisdbg); fnvlist_free(thisdbg); } return 
(0); } static void gather_holds(zfs_handle_t *zhp, send_dump_data_t *sdd) { assert(zhp->zfs_type == ZFS_TYPE_SNAPSHOT); /* * zfs_send() only sets snapholds for sends that need them, * e.g. replication and doall. */ if (sdd->snapholds == NULL) return; fnvlist_add_string(sdd->snapholds, zhp->zfs_name, sdd->holdtag); } int zfs_send_progress(zfs_handle_t *zhp, int fd, uint64_t *bytes_written, uint64_t *blocks_visited) { zfs_cmd_t zc = {"\0"}; if (bytes_written != NULL) *bytes_written = 0; if (blocks_visited != NULL) *blocks_visited = 0; (void) strlcpy(zc.zc_name, zhp->zfs_name, sizeof (zc.zc_name)); zc.zc_cookie = fd; if (zfs_ioctl(zhp->zfs_hdl, ZFS_IOC_SEND_PROGRESS, &zc) != 0) return (errno); if (bytes_written != NULL) *bytes_written = zc.zc_cookie; if (blocks_visited != NULL) *blocks_visited = zc.zc_objset_type; return (0); } static void * send_progress_thread(void *arg) { progress_arg_t *pa = arg; zfs_handle_t *zhp = pa->pa_zhp; uint64_t bytes; uint64_t blocks; uint64_t total = pa->pa_size / 100; char buf[16]; time_t t; struct tm tm; int err; if (!pa->pa_parsable && pa->pa_verbosity != 0) { (void) fprintf(stderr, "TIME %s %sSNAPSHOT %s\n", pa->pa_estimate ? "BYTES" : " SENT", pa->pa_verbosity >= 2 ? " BLOCKS " : "", zhp->zfs_name); } /* * Print the progress from ZFS_IOC_SEND_PROGRESS every second. */ for (;;) { (void) sleep(1); if ((err = zfs_send_progress(zhp, pa->pa_fd, &bytes, &blocks)) != 0) { if (err == EINTR || err == ENOENT) return ((void *)0); return ((void *)(uintptr_t)err); } (void) time(&t); localtime_r(&t, &tm); if (pa->pa_astitle) { char buf_bytes[16]; char buf_size[16]; int pct; zfs_nicenum(bytes, buf_bytes, sizeof (buf_bytes)); zfs_nicenum(pa->pa_size, buf_size, sizeof (buf_size)); pct = (total > 0) ? 
bytes / total : 100; zfs_setproctitle("sending %s (%d%%: %s/%s)", zhp->zfs_name, MIN(pct, 100), buf_bytes, buf_size); } if (pa->pa_verbosity >= 2 && pa->pa_parsable) { (void) fprintf(stderr, "%02d:%02d:%02d\t%llu\t%llu\t%s\n", tm.tm_hour, tm.tm_min, tm.tm_sec, (u_longlong_t)bytes, (u_longlong_t)blocks, zhp->zfs_name); } else if (pa->pa_verbosity >= 2) { zfs_nicenum(bytes, buf, sizeof (buf)); (void) fprintf(stderr, "%02d:%02d:%02d %5s %8llu %s\n", tm.tm_hour, tm.tm_min, tm.tm_sec, buf, (u_longlong_t)blocks, zhp->zfs_name); } else if (pa->pa_parsable) { (void) fprintf(stderr, "%02d:%02d:%02d\t%llu\t%s\n", tm.tm_hour, tm.tm_min, tm.tm_sec, (u_longlong_t)bytes, zhp->zfs_name); } else if (pa->pa_verbosity != 0) { zfs_nicebytes(bytes, buf, sizeof (buf)); (void) fprintf(stderr, "%02d:%02d:%02d %5s %s\n", tm.tm_hour, tm.tm_min, tm.tm_sec, buf, zhp->zfs_name); } } } static boolean_t send_progress_thread_exit(libzfs_handle_t *hdl, pthread_t ptid) { void *status = NULL; (void) pthread_cancel(ptid); (void) pthread_join(ptid, &status); int error = (int)(uintptr_t)status; if (error != 0 && status != PTHREAD_CANCELED) return (zfs_standard_error(hdl, error, dgettext(TEXT_DOMAIN, "progress thread exited nonzero"))); else return (B_FALSE); } static void send_print_verbose(FILE *fout, const char *tosnap, const char *fromsnap, uint64_t size, boolean_t parsable) { if (parsable) { if (fromsnap != NULL) { (void) fprintf(fout, dgettext(TEXT_DOMAIN, "incremental\t%s\t%s"), fromsnap, tosnap); } else { /* * Workaround for GCC 12+ with UBSan enabled deficencies. * * GCC 12+ invoked with -fsanitize=undefined incorrectly reports the code * below as violating -Wformat-overflow. 
*/ #if defined(__GNUC__) && !defined(__clang__) && \ defined(ZFS_UBSAN_ENABLED) && defined(HAVE_FORMAT_OVERFLOW) #pragma GCC diagnostic push #pragma GCC diagnostic ignored "-Wformat-overflow" #endif (void) fprintf(fout, dgettext(TEXT_DOMAIN, "full\t%s"), tosnap); #if defined(__GNUC__) && !defined(__clang__) && \ defined(ZFS_UBSAN_ENABLED) && defined(HAVE_FORMAT_OVERFLOW) #pragma GCC diagnostic pop #endif } (void) fprintf(fout, "\t%llu", (longlong_t)size); } else { if (fromsnap != NULL) { if (strchr(fromsnap, '@') == NULL && strchr(fromsnap, '#') == NULL) { (void) fprintf(fout, dgettext(TEXT_DOMAIN, "send from @%s to %s"), fromsnap, tosnap); } else { (void) fprintf(fout, dgettext(TEXT_DOMAIN, "send from %s to %s"), fromsnap, tosnap); } } else { (void) fprintf(fout, dgettext(TEXT_DOMAIN, "full send of %s"), tosnap); } if (size != 0) { char buf[16]; zfs_nicebytes(size, buf, sizeof (buf)); /* * Workaround for GCC 12+ with UBSan enabled deficencies. * * GCC 12+ invoked with -fsanitize=undefined incorrectly reports the code * below as violating -Wformat-overflow. */ #if defined(__GNUC__) && !defined(__clang__) && \ defined(ZFS_UBSAN_ENABLED) && defined(HAVE_FORMAT_OVERFLOW) #pragma GCC diagnostic push #pragma GCC diagnostic ignored "-Wformat-overflow" #endif (void) fprintf(fout, dgettext(TEXT_DOMAIN, " estimated size is %s"), buf); #if defined(__GNUC__) && !defined(__clang__) && \ defined(ZFS_UBSAN_ENABLED) && defined(HAVE_FORMAT_OVERFLOW) #pragma GCC diagnostic pop #endif } } (void) fprintf(fout, "\n"); } /* * Send a single filesystem snapshot, updating the send dump data. * This interface is intended for use as a zfs_iter_snapshots_sorted visitor. */ static int dump_snapshot(zfs_handle_t *zhp, void *arg) { send_dump_data_t *sdd = arg; progress_arg_t pa = { 0 }; pthread_t tid; char *thissnap; enum lzc_send_flags flags = 0; int err; boolean_t isfromsnap, istosnap, fromorigin; boolean_t exclude = B_FALSE; FILE *fout = sdd->std_out ? 
stdout : stderr; err = 0; thissnap = strchr(zhp->zfs_name, '@') + 1; isfromsnap = (sdd->fromsnap != NULL && strcmp(sdd->fromsnap, thissnap) == 0); if (!sdd->seenfrom && isfromsnap) { gather_holds(zhp, sdd); sdd->seenfrom = B_TRUE; (void) strlcpy(sdd->prevsnap, thissnap, sizeof (sdd->prevsnap)); sdd->prevsnap_obj = zfs_prop_get_int(zhp, ZFS_PROP_OBJSETID); zfs_close(zhp); return (0); } if (sdd->seento || !sdd->seenfrom) { zfs_close(zhp); return (0); } istosnap = (strcmp(sdd->tosnap, thissnap) == 0); if (istosnap) sdd->seento = B_TRUE; if (sdd->large_block) flags |= LZC_SEND_FLAG_LARGE_BLOCK; if (sdd->embed_data) flags |= LZC_SEND_FLAG_EMBED_DATA; if (sdd->compress) flags |= LZC_SEND_FLAG_COMPRESS; if (sdd->raw) flags |= LZC_SEND_FLAG_RAW; if (!sdd->doall && !isfromsnap && !istosnap) { if (sdd->replicate) { char *snapname; nvlist_t *snapprops; /* * Filter out all intermediate snapshots except origin * snapshots needed to replicate clones. */ nvlist_t *nvfs = fsavl_find(sdd->fsavl, zhp->zfs_dmustats.dds_guid, &snapname); if (nvfs != NULL) { snapprops = fnvlist_lookup_nvlist(nvfs, "snapprops"); snapprops = fnvlist_lookup_nvlist(snapprops, thissnap); exclude = !nvlist_exists(snapprops, "is_clone_origin"); } } else { exclude = B_TRUE; } } /* * If a filter function exists, call it to determine whether * this snapshot will be sent. */ if (exclude || (sdd->filter_cb != NULL && sdd->filter_cb(zhp, sdd->filter_cb_arg) == B_FALSE)) { /* * This snapshot is filtered out. Don't send it, and don't * set prevsnap_obj, so it will be as if this snapshot didn't * exist, and the next accepted snapshot will be sent as * an incremental from the last accepted one, or as the * first (and full) snapshot in the case of a replication, * non-incremental send. 
*/ zfs_close(zhp); return (0); } gather_holds(zhp, sdd); fromorigin = sdd->prevsnap[0] == '\0' && (sdd->fromorigin || sdd->replicate); if (sdd->verbosity != 0) { uint64_t size = 0; char fromds[ZFS_MAX_DATASET_NAME_LEN]; if (sdd->prevsnap[0] != '\0') { (void) strlcpy(fromds, zhp->zfs_name, sizeof (fromds)); *(strchr(fromds, '@') + 1) = '\0'; (void) strlcat(fromds, sdd->prevsnap, sizeof (fromds)); } if (zfs_send_space(zhp, zhp->zfs_name, sdd->prevsnap[0] ? fromds : NULL, flags, &size) == 0) { send_print_verbose(fout, zhp->zfs_name, sdd->prevsnap[0] ? sdd->prevsnap : NULL, size, sdd->parsable); sdd->size += size; } } if (!sdd->dryrun) { /* * If progress reporting is requested, spawn a new thread to * poll ZFS_IOC_SEND_PROGRESS at a regular interval. */ if (sdd->progress || sdd->progressastitle) { pa.pa_zhp = zhp; pa.pa_fd = sdd->outfd; pa.pa_parsable = sdd->parsable; pa.pa_estimate = B_FALSE; pa.pa_verbosity = sdd->verbosity; pa.pa_size = sdd->size; pa.pa_astitle = sdd->progressastitle; if ((err = pthread_create(&tid, NULL, send_progress_thread, &pa)) != 0) { zfs_close(zhp); return (err); } } err = dump_ioctl(zhp, sdd->prevsnap, sdd->prevsnap_obj, fromorigin, sdd->outfd, flags, sdd->debugnv); if ((sdd->progress || sdd->progressastitle) && send_progress_thread_exit(zhp->zfs_hdl, tid)) return (-1); } (void) strlcpy(sdd->prevsnap, thissnap, sizeof (sdd->prevsnap)); sdd->prevsnap_obj = zfs_prop_get_int(zhp, ZFS_PROP_OBJSETID); zfs_close(zhp); return (err); } /* * Send all snapshots for a filesystem, updating the send dump data. */ static int dump_filesystem(zfs_handle_t *zhp, send_dump_data_t *sdd) { int rv = 0; boolean_t missingfrom = B_FALSE; zfs_cmd_t zc = {"\0"}; uint64_t min_txg = 0, max_txg = 0; /* * Make sure the tosnap exists. 
*/ (void) snprintf(zc.zc_name, sizeof (zc.zc_name), "%s@%s", zhp->zfs_name, sdd->tosnap); if (zfs_ioctl(zhp->zfs_hdl, ZFS_IOC_OBJSET_STATS, &zc) != 0) { (void) fprintf(stderr, dgettext(TEXT_DOMAIN, "WARNING: could not send %s@%s: does not exist\n"), zhp->zfs_name, sdd->tosnap); sdd->err = B_TRUE; return (0); } /* * If this fs does not have fromsnap, and we're doing * recursive, we need to send a full stream from the * beginning (or an incremental from the origin if this * is a clone). If we're doing non-recursive, then let * them get the error. */ if (sdd->replicate && sdd->fromsnap) { /* * Make sure the fromsnap exists. */ (void) snprintf(zc.zc_name, sizeof (zc.zc_name), "%s@%s", zhp->zfs_name, sdd->fromsnap); if (zfs_ioctl(zhp->zfs_hdl, ZFS_IOC_OBJSET_STATS, &zc) != 0) missingfrom = B_TRUE; } sdd->seenfrom = sdd->seento = B_FALSE; sdd->prevsnap[0] = '\0'; sdd->prevsnap_obj = 0; if (sdd->fromsnap == NULL || missingfrom) sdd->seenfrom = B_TRUE; /* * Iterate through all snapshots and process the ones we will be * sending. If we only have a "from" and "to" snapshot to deal * with, we can avoid iterating through all the other snapshots. */ if (sdd->doall || sdd->replicate || sdd->tosnap == NULL) { if (!sdd->replicate) { if (sdd->fromsnap != NULL) { min_txg = get_snap_txg(zhp->zfs_hdl, zhp->zfs_name, sdd->fromsnap); } if (sdd->tosnap != NULL) { max_txg = get_snap_txg(zhp->zfs_hdl, zhp->zfs_name, sdd->tosnap); } } rv = zfs_iter_snapshots_sorted(zhp, 0, dump_snapshot, sdd, min_txg, max_txg); } else { char snapname[MAXPATHLEN] = { 0 }; zfs_handle_t *snap; /* Dump fromsnap. */ if (!sdd->seenfrom) { (void) snprintf(snapname, sizeof (snapname), "%s@%s", zhp->zfs_name, sdd->fromsnap); snap = zfs_open(zhp->zfs_hdl, snapname, ZFS_TYPE_SNAPSHOT); if (snap != NULL) rv = dump_snapshot(snap, sdd); else rv = errno; } /* Dump tosnap. 
*/ if (rv == 0) { (void) snprintf(snapname, sizeof (snapname), "%s@%s", zhp->zfs_name, sdd->tosnap); snap = zfs_open(zhp->zfs_hdl, snapname, ZFS_TYPE_SNAPSHOT); if (snap != NULL) rv = dump_snapshot(snap, sdd); else rv = errno; } } if (!sdd->seenfrom) { (void) fprintf(stderr, dgettext(TEXT_DOMAIN, "WARNING: could not send %s@%s:\n" "incremental source (%s@%s) does not exist\n"), zhp->zfs_name, sdd->tosnap, zhp->zfs_name, sdd->fromsnap); sdd->err = B_TRUE; } else if (!sdd->seento) { if (sdd->fromsnap) { (void) fprintf(stderr, dgettext(TEXT_DOMAIN, "WARNING: could not send %s@%s:\n" "incremental source (%s@%s) " "is not earlier than it\n"), zhp->zfs_name, sdd->tosnap, zhp->zfs_name, sdd->fromsnap); } else { (void) fprintf(stderr, dgettext(TEXT_DOMAIN, "WARNING: " "could not send %s@%s: does not exist\n"), zhp->zfs_name, sdd->tosnap); } sdd->err = B_TRUE; } return (rv); } /* * Send all snapshots for all filesystems in sdd. */ static int dump_filesystems(zfs_handle_t *rzhp, send_dump_data_t *sdd) { nvpair_t *fspair; boolean_t needagain, progress; if (!sdd->replicate) return (dump_filesystem(rzhp, sdd)); /* Mark the clone origin snapshots. 
*/ for (fspair = nvlist_next_nvpair(sdd->fss, NULL); fspair; fspair = nvlist_next_nvpair(sdd->fss, fspair)) { nvlist_t *nvfs; uint64_t origin_guid = 0; nvfs = fnvpair_value_nvlist(fspair); (void) nvlist_lookup_uint64(nvfs, "origin", &origin_guid); if (origin_guid != 0) { char *snapname; nvlist_t *origin_nv = fsavl_find(sdd->fsavl, origin_guid, &snapname); if (origin_nv != NULL) { nvlist_t *snapprops; snapprops = fnvlist_lookup_nvlist(origin_nv, "snapprops"); snapprops = fnvlist_lookup_nvlist(snapprops, snapname); fnvlist_add_boolean(snapprops, "is_clone_origin"); } } } again: needagain = progress = B_FALSE; for (fspair = nvlist_next_nvpair(sdd->fss, NULL); fspair; fspair = nvlist_next_nvpair(sdd->fss, fspair)) { nvlist_t *fslist, *parent_nv; char *fsname; zfs_handle_t *zhp; int err; uint64_t origin_guid = 0; uint64_t parent_guid = 0; fslist = fnvpair_value_nvlist(fspair); if (nvlist_lookup_boolean(fslist, "sent") == 0) continue; fsname = fnvlist_lookup_string(fslist, "name"); (void) nvlist_lookup_uint64(fslist, "origin", &origin_guid); (void) nvlist_lookup_uint64(fslist, "parentfromsnap", &parent_guid); if (parent_guid != 0) { parent_nv = fsavl_find(sdd->fsavl, parent_guid, NULL); if (!nvlist_exists(parent_nv, "sent")) { /* Parent has not been sent; skip this one. */ needagain = B_TRUE; continue; } } if (origin_guid != 0) { nvlist_t *origin_nv = fsavl_find(sdd->fsavl, origin_guid, NULL); if (origin_nv != NULL && !nvlist_exists(origin_nv, "sent")) { /* * Origin has not been sent yet; * skip this clone. */ needagain = B_TRUE; continue; } } zhp = zfs_open(rzhp->zfs_hdl, fsname, ZFS_TYPE_DATASET); if (zhp == NULL) return (-1); err = dump_filesystem(zhp, sdd); fnvlist_add_boolean(fslist, "sent"); progress = B_TRUE; zfs_close(zhp); if (err) return (err); } if (needagain) { assert(progress); goto again; } /* Clean out the sent flags in case we reuse this fss. 
*/ for (fspair = nvlist_next_nvpair(sdd->fss, NULL); fspair; fspair = nvlist_next_nvpair(sdd->fss, fspair)) { nvlist_t *fslist; fslist = fnvpair_value_nvlist(fspair); (void) nvlist_remove_all(fslist, "sent"); } return (0); } nvlist_t * zfs_send_resume_token_to_nvlist(libzfs_handle_t *hdl, const char *token) { unsigned int version; int nread, i; unsigned long long checksum, packed_len; /* * Decode token header, which is: * -- * Note that the only supported token version is 1. */ nread = sscanf(token, "%u-%llx-%llx-", &version, &checksum, &packed_len); if (nread != 3) { zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "resume token is corrupt (invalid format)")); return (NULL); } if (version != ZFS_SEND_RESUME_TOKEN_VERSION) { zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "resume token is corrupt (invalid version %u)"), version); return (NULL); } /* Convert hexadecimal representation to binary. */ token = strrchr(token, '-') + 1; int len = strlen(token) / 2; unsigned char *compressed = zfs_alloc(hdl, len); for (i = 0; i < len; i++) { nread = sscanf(token + i * 2, "%2hhx", compressed + i); if (nread != 1) { free(compressed); zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "resume token is corrupt " "(payload is not hex-encoded)")); return (NULL); } } /* Verify checksum. */ zio_cksum_t cksum; fletcher_4_native_varsize(compressed, len, &cksum); if (cksum.zc_word[0] != checksum) { free(compressed); zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "resume token is corrupt (incorrect checksum)")); return (NULL); } /* Uncompress. */ void *packed = zfs_alloc(hdl, packed_len); uLongf packed_len_long = packed_len; if (uncompress(packed, &packed_len_long, compressed, len) != Z_OK || packed_len_long != packed_len) { free(packed); free(compressed); zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "resume token is corrupt (decompression failed)")); return (NULL); } /* Unpack nvlist. 
*/ nvlist_t *nv; int error = nvlist_unpack(packed, packed_len, &nv, KM_SLEEP); free(packed); free(compressed); if (error != 0) { zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "resume token is corrupt (nvlist_unpack failed)")); return (NULL); } return (nv); } static enum lzc_send_flags lzc_flags_from_sendflags(const sendflags_t *flags) { enum lzc_send_flags lzc_flags = 0; if (flags->largeblock) lzc_flags |= LZC_SEND_FLAG_LARGE_BLOCK; if (flags->embed_data) lzc_flags |= LZC_SEND_FLAG_EMBED_DATA; if (flags->compress) lzc_flags |= LZC_SEND_FLAG_COMPRESS; if (flags->raw) lzc_flags |= LZC_SEND_FLAG_RAW; if (flags->saved) lzc_flags |= LZC_SEND_FLAG_SAVED; return (lzc_flags); } static int estimate_size(zfs_handle_t *zhp, const char *from, int fd, sendflags_t *flags, uint64_t resumeobj, uint64_t resumeoff, uint64_t bytes, const char *redactbook, char *errbuf, uint64_t *sizep) { uint64_t size; FILE *fout = flags->dryrun ? stdout : stderr; progress_arg_t pa = { 0 }; int err = 0; pthread_t ptid; if (flags->progress || flags->progressastitle) { pa.pa_zhp = zhp; pa.pa_fd = fd; pa.pa_parsable = flags->parsable; pa.pa_estimate = B_TRUE; pa.pa_verbosity = flags->verbosity; err = pthread_create(&ptid, NULL, send_progress_thread, &pa); if (err != 0) { zfs_error_aux(zhp->zfs_hdl, "%s", strerror(errno)); return (zfs_error(zhp->zfs_hdl, EZFS_THREADCREATEFAILED, errbuf)); } } err = lzc_send_space_resume_redacted(zhp->zfs_name, from, lzc_flags_from_sendflags(flags), resumeobj, resumeoff, bytes, redactbook, fd, &size); *sizep = size; if ((flags->progress || flags->progressastitle) && send_progress_thread_exit(zhp->zfs_hdl, ptid)) return (-1); if (!flags->progress && !flags->parsable) return (err); if (err != 0) { zfs_error_aux(zhp->zfs_hdl, "%s", strerror(err)); return (zfs_error(zhp->zfs_hdl, EZFS_BADBACKUP, errbuf)); } send_print_verbose(fout, zhp->zfs_name, from, size, flags->parsable); if (flags->parsable) { (void) fprintf(fout, "size\t%llu\n", (longlong_t)size); } else { char buf[16]; 
zfs_nicenum(size, buf, sizeof (buf)); (void) fprintf(fout, dgettext(TEXT_DOMAIN, "total estimated size is %s\n"), buf); } return (0); } static boolean_t redact_snaps_contains(const uint64_t *snaps, uint64_t num_snaps, uint64_t guid) { for (int i = 0; i < num_snaps; i++) { if (snaps[i] == guid) return (B_TRUE); } return (B_FALSE); } static boolean_t redact_snaps_equal(const uint64_t *snaps1, uint64_t num_snaps1, const uint64_t *snaps2, uint64_t num_snaps2) { if (num_snaps1 != num_snaps2) return (B_FALSE); for (int i = 0; i < num_snaps1; i++) { if (!redact_snaps_contains(snaps2, num_snaps2, snaps1[i])) return (B_FALSE); } return (B_TRUE); } static int get_bookmarks(const char *path, nvlist_t **bmarksp) { nvlist_t *props = fnvlist_alloc(); int error; fnvlist_add_boolean(props, "redact_complete"); fnvlist_add_boolean(props, zfs_prop_to_name(ZFS_PROP_REDACT_SNAPS)); error = lzc_get_bookmarks(path, props, bmarksp); fnvlist_free(props); return (error); } static nvpair_t * find_redact_pair(nvlist_t *bmarks, const uint64_t *redact_snap_guids, int num_redact_snaps) { nvpair_t *pair; for (pair = nvlist_next_nvpair(bmarks, NULL); pair; pair = nvlist_next_nvpair(bmarks, pair)) { nvlist_t *bmark = fnvpair_value_nvlist(pair); nvlist_t *vallist = fnvlist_lookup_nvlist(bmark, zfs_prop_to_name(ZFS_PROP_REDACT_SNAPS)); uint_t len = 0; uint64_t *bmarksnaps = fnvlist_lookup_uint64_array(vallist, ZPROP_VALUE, &len); if (redact_snaps_equal(redact_snap_guids, num_redact_snaps, bmarksnaps, len)) { break; } } return (pair); } static boolean_t get_redact_complete(nvpair_t *pair) { nvlist_t *bmark = fnvpair_value_nvlist(pair); nvlist_t *vallist = fnvlist_lookup_nvlist(bmark, "redact_complete"); boolean_t complete = fnvlist_lookup_boolean_value(vallist, ZPROP_VALUE); return (complete); } /* * Check that the list of redaction snapshots in the bookmark matches the send * we're resuming, and return whether or not it's complete. 
*
 * Note that the caller needs to free the contents of *bookname with free() if
 * this function returns successfully.
 */
static int
find_redact_book(libzfs_handle_t *hdl, const char *path,
    const uint64_t *redact_snap_guids, int num_redact_snaps,
    char **bookname)
{
	char errbuf[ERRBUFLEN];
	nvlist_t *bmarks;

	(void) snprintf(errbuf, sizeof (errbuf), dgettext(TEXT_DOMAIN,
	    "cannot resume send"));

	/* Fetch the bookmarks; each failure code gets its own explanation. */
	int error = get_bookmarks(path, &bmarks);
	switch (error) {
	case 0:
		break;
	case ESRCH:
		zfs_error_aux(hdl, dgettext(TEXT_DOMAIN,
		    "nonexistent redaction bookmark provided"));
		return (zfs_error(hdl, EZFS_BADPROP, errbuf));
	case ENOENT:
		zfs_error_aux(hdl, dgettext(TEXT_DOMAIN,
		    "dataset to be sent no longer exists"));
		return (zfs_error(hdl, EZFS_BADPROP, errbuf));
	default:
		zfs_error_aux(hdl, dgettext(TEXT_DOMAIN,
		    "unknown error: %s"), strerror(error));
		return (zfs_error(hdl, EZFS_BADPROP, errbuf));
	}

	/* Locate the bookmark whose redaction list matches ours. */
	nvpair_t *match = find_redact_pair(bmarks, redact_snap_guids,
	    num_redact_snaps);
	if (match == NULL) {
		fnvlist_free(bmarks);
		zfs_error_aux(hdl, dgettext(TEXT_DOMAIN,
		    "no appropriate redaction bookmark exists"));
		return (zfs_error(hdl, EZFS_BADPROP, errbuf));
	}

	/* A bookmark whose redaction has not completed cannot be used. */
	if (!get_redact_complete(match)) {
		fnvlist_free(bmarks);
		zfs_error_aux(hdl, dgettext(TEXT_DOMAIN,
		    "incomplete redaction bookmark provided"));
		return (zfs_error(hdl, EZFS_BADPROP, errbuf));
	}

	/* Copy the name out before the list that owns it is released. */
	*bookname = strndup(nvpair_name(match), ZFS_MAX_DATASET_NAME_LEN);
	ASSERT3P(*bookname, !=, NULL);
	fnvlist_free(bmarks);
	return (0);
}

/*
 * Derive LZC_SEND_FLAG_* bits from the "*ok" keys present in an unpacked
 * resume token.
 */
static enum lzc_send_flags
lzc_flags_from_resume_nvl(nvlist_t *resume_nvl)
{
	static const struct {
		const char *key;
		enum lzc_send_flags flag;
	} token_flags[] = {
		{ "largeblockok", LZC_SEND_FLAG_LARGE_BLOCK },
		{ "embedok", LZC_SEND_FLAG_EMBED_DATA },
		{ "compressok", LZC_SEND_FLAG_COMPRESS },
		{ "rawok", LZC_SEND_FLAG_RAW },
		{ "savedok", LZC_SEND_FLAG_SAVED },
	};
	enum lzc_send_flags lzc_flags = 0;

	for (size_t k = 0;
	    k < sizeof (token_flags) / sizeof (token_flags[0]); k++) {
		if (nvlist_exists(resume_nvl, token_flags[k].key))
			lzc_flags |= token_flags[k].flag;
	}

	return (lzc_flags);
}

static int
zfs_send_resume_impl_cb_impl(libzfs_handle_t *hdl, sendflags_t *flags,
    int outfd, nvlist_t *resume_nvl)
{
	/*
	 * Resume the send described by the unpacked resume token
	 * "resume_nvl", writing the stream to "outfd".  The token must carry
	 * "toname", "object", "offset", "bytes" and "toguid"; "fromguid" is
	 * optional.  Send flags found in the token are OR-ed with "flags".
	 *
	 * Returns 0 on success, -1 if the progress thread could not be
	 * stopped, the pthread_create() error if that thread could not be
	 * started, or the value produced by zfs_error() /
	 * zfs_standard_error() otherwise.
	 *
	 * Once "zhp" is open every exit goes through "out", which releases
	 * both the handle and the redaction bookmark name.
	 */
	char errbuf[ERRBUFLEN];
	char *toname;
	char *fromname = NULL;
	uint64_t resumeobj, resumeoff, toguid, fromguid, bytes;
	zfs_handle_t *zhp;
	int error = 0;
	char name[ZFS_MAX_DATASET_NAME_LEN];
	FILE *fout = (flags->verbosity > 0 && flags->dryrun) ?
	    stdout : stderr;
	uint64_t *redact_snap_guids = NULL;
	int num_redact_snaps = 0;
	char *redact_book = NULL;
	uint64_t size = 0;
	progress_arg_t pa = { 0 };
	pthread_t tid;

	(void) snprintf(errbuf, sizeof (errbuf), dgettext(TEXT_DOMAIN,
	    "cannot resume send"));

	if (flags->verbosity != 0) {
		(void) fprintf(fout, dgettext(TEXT_DOMAIN,
		    "resume token contents:\n"));
		nvlist_print(fout, resume_nvl);
	}

	if (nvlist_lookup_string(resume_nvl, "toname", &toname) != 0 ||
	    nvlist_lookup_uint64(resume_nvl, "object", &resumeobj) != 0 ||
	    nvlist_lookup_uint64(resume_nvl, "offset", &resumeoff) != 0 ||
	    nvlist_lookup_uint64(resume_nvl, "bytes", &bytes) != 0 ||
	    nvlist_lookup_uint64(resume_nvl, "toguid", &toguid) != 0) {
		zfs_error_aux(hdl, dgettext(TEXT_DOMAIN,
		    "resume token is corrupt"));
		return (zfs_error(hdl, EZFS_FAULT, errbuf));
	}
	fromguid = 0;
	(void) nvlist_lookup_uint64(resume_nvl, "fromguid", &fromguid);

	if (flags->saved) {
		(void) strlcpy(name, toname, sizeof (name));
	} else {
		error = guid_to_name(hdl, toname, toguid, B_FALSE, name);
		if (error != 0) {
			if (zfs_dataset_exists(hdl, toname,
			    ZFS_TYPE_DATASET)) {
				zfs_error_aux(hdl, dgettext(TEXT_DOMAIN,
				    "'%s' is no longer the same snapshot "
				    "used in the initial send"), toname);
			} else {
				zfs_error_aux(hdl, dgettext(TEXT_DOMAIN,
				    "'%s' used in the initial send no "
				    "longer exists"), toname);
			}
			return (zfs_error(hdl, EZFS_BADPATH, errbuf));
		}
	}

	zhp = zfs_open(hdl, name, ZFS_TYPE_DATASET);
	if (zhp == NULL) {
		zfs_error_aux(hdl, dgettext(TEXT_DOMAIN,
		    "unable to access '%s'"), name);
		return (zfs_error(hdl, EZFS_BADPATH, errbuf));
	}

	if (nvlist_lookup_uint64_array(resume_nvl, "book_redact_snaps",
	    &redact_snap_guids, (uint_t *)&num_redact_snaps) != 0) {
		num_redact_snaps = -1;
	}

	if (fromguid != 0) {
		/* "name" is reused here to hold the incremental source. */
		if (guid_to_name_redact_snaps(hdl, toname, fromguid, B_TRUE,
		    redact_snap_guids, num_redact_snaps, name) != 0) {
			zfs_error_aux(hdl, dgettext(TEXT_DOMAIN,
			    "incremental source %#llx no longer exists"),
			    (longlong_t)fromguid);
			error = zfs_error(hdl, EZFS_BADPATH, errbuf);
			goto out;
		}
		fromname = name;
	}

	redact_snap_guids = NULL;

	if (nvlist_lookup_uint64_array(resume_nvl,
	    zfs_prop_to_name(ZFS_PROP_REDACT_SNAPS), &redact_snap_guids,
	    (uint_t *)&num_redact_snaps) == 0) {
		char path[ZFS_MAX_DATASET_NAME_LEN];

		(void) strlcpy(path, toname, sizeof (path));
		char *at = strchr(path, '@');
		ASSERT3P(at, !=, NULL);

		*at = '\0';

		if ((error = find_redact_book(hdl, path, redact_snap_guids,
		    num_redact_snaps, &redact_book)) != 0) {
			goto out;
		}
	}

	enum lzc_send_flags lzc_flags = lzc_flags_from_sendflags(flags) |
	    lzc_flags_from_resume_nvl(resume_nvl);

	if (flags->verbosity != 0 || flags->progressastitle) {
		/*
		 * Some of these may have come from the resume token, set them
		 * here for size estimate purposes.
		 */
		sendflags_t tmpflags = *flags;
		if (lzc_flags & LZC_SEND_FLAG_LARGE_BLOCK)
			tmpflags.largeblock = B_TRUE;
		if (lzc_flags & LZC_SEND_FLAG_COMPRESS)
			tmpflags.compress = B_TRUE;
		if (lzc_flags & LZC_SEND_FLAG_EMBED_DATA)
			tmpflags.embed_data = B_TRUE;
		if (lzc_flags & LZC_SEND_FLAG_RAW)
			tmpflags.raw = B_TRUE;
		if (lzc_flags & LZC_SEND_FLAG_SAVED)
			tmpflags.saved = B_TRUE;
		error = estimate_size(zhp, fromname, outfd, &tmpflags,
		    resumeobj, resumeoff, bytes, redact_book, errbuf, &size);
	}

	/* A dry run stops here and returns the estimate result. */
	if (flags->dryrun)
		goto out;

	/*
	 * If progress reporting is requested, spawn a new thread to
	 * poll ZFS_IOC_SEND_PROGRESS at a regular interval.
	 */
	if (flags->progress || flags->progressastitle) {
		pa.pa_zhp = zhp;
		pa.pa_fd = outfd;
		pa.pa_parsable = flags->parsable;
		pa.pa_estimate = B_FALSE;
		pa.pa_verbosity = flags->verbosity;
		pa.pa_size = size;
		pa.pa_astitle = flags->progressastitle;

		error = pthread_create(&tid, NULL,
		    send_progress_thread, &pa);
		if (error != 0)
			goto out;
	}

	error = lzc_send_resume_redacted(zhp->zfs_name, fromname, outfd,
	    lzc_flags, resumeobj, resumeoff, redact_book);

	if ((flags->progressastitle || flags->progress) &&
	    send_progress_thread_exit(hdl, tid)) {
		error = -1;
		goto out;
	}

	/* The "cannot resume send" prefix is not needed past this point. */
	(void) snprintf(errbuf, sizeof (errbuf), dgettext(TEXT_DOMAIN,
	    "warning: cannot send '%s'"), zhp->zfs_name);

	/*
	 * Translate the result while zhp is still open: the ESRCH case reads
	 * zhp->zfs_name.  The messages use "error", the value the switch
	 * dispatches on, because errno may have been overwritten by the
	 * calls made since the send returned.
	 */
	switch (error) {
	case 0:
		break;
	case EACCES:
		zfs_error_aux(hdl, dgettext(TEXT_DOMAIN,
		    "source key must be loaded"));
		error = zfs_error(hdl, EZFS_CRYPTOFAILED, errbuf);
		break;
	case ESRCH:
		if (lzc_exists(zhp->zfs_name)) {
			zfs_error_aux(hdl, dgettext(TEXT_DOMAIN,
			    "incremental source could not be found"));
		}
		error = zfs_error(hdl, EZFS_NOENT, errbuf);
		break;

	case EXDEV:
	case ENOENT:
	case EDQUOT:
	case EFBIG:
	case EIO:
	case ENOLINK:
	case ENOSPC:
	case ENOSTR:
	case ENXIO:
	case EPIPE:
	case ERANGE:
	case EFAULT:
	case EROFS:
		zfs_error_aux(hdl, "%s", strerror(error));
		error = zfs_error(hdl, EZFS_BADBACKUP, errbuf);
		break;

	default:
		error = zfs_standard_error(hdl, error, errbuf);
		break;
	}

out:
	free(redact_book);
	zfs_close(zhp);

	return (error);
}

/* Arguments bundled for zfs_send_resume_impl_cb(). */
struct zfs_send_resume_impl {
	libzfs_handle_t *hdl;
	sendflags_t *flags;
	nvlist_t *resume_nvl;
};

/*
 * Callback adapter: unpack the struct zfs_send_resume_impl passed as "arg"
 * and forward to zfs_send_resume_impl_cb_impl() with the given "outfd".
 */
static int
zfs_send_resume_impl_cb(int outfd, void *arg)
{
	struct zfs_send_resume_impl *zsri = arg;
	return (zfs_send_resume_impl_cb_impl(zsri->hdl, zsri->flags, outfd,
	    zsri->resume_nvl));
}

static int
zfs_send_resume_impl(libzfs_handle_t *hdl, sendflags_t *flags, int outfd,
    nvlist_t *resume_nvl)
{
	struct
zfs_send_resume_impl zsri = { .hdl = hdl, .flags = flags, .resume_nvl = resume_nvl, }; return (lzc_send_wrapper(zfs_send_resume_impl_cb, outfd, &zsri)); } int zfs_send_resume(libzfs_handle_t *hdl, sendflags_t *flags, int outfd, const char *resume_token) { int ret; char errbuf[ERRBUFLEN]; nvlist_t *resume_nvl; (void) snprintf(errbuf, sizeof (errbuf), dgettext(TEXT_DOMAIN, "cannot resume send")); resume_nvl = zfs_send_resume_token_to_nvlist(hdl, resume_token); if (resume_nvl == NULL) { /* * zfs_error_aux has already been set by * zfs_send_resume_token_to_nvlist() */ return (zfs_error(hdl, EZFS_FAULT, errbuf)); } ret = zfs_send_resume_impl(hdl, flags, outfd, resume_nvl); fnvlist_free(resume_nvl); return (ret); } int zfs_send_saved(zfs_handle_t *zhp, sendflags_t *flags, int outfd, const char *resume_token) { int ret; libzfs_handle_t *hdl = zhp->zfs_hdl; nvlist_t *saved_nvl = NULL, *resume_nvl = NULL; uint64_t saved_guid = 0, resume_guid = 0; uint64_t obj = 0, off = 0, bytes = 0; char token_buf[ZFS_MAXPROPLEN]; char errbuf[ERRBUFLEN]; (void) snprintf(errbuf, sizeof (errbuf), dgettext(TEXT_DOMAIN, "saved send failed")); ret = zfs_prop_get(zhp, ZFS_PROP_RECEIVE_RESUME_TOKEN, token_buf, sizeof (token_buf), NULL, NULL, 0, B_TRUE); if (ret != 0) goto out; saved_nvl = zfs_send_resume_token_to_nvlist(hdl, token_buf); if (saved_nvl == NULL) { /* * zfs_error_aux has already been set by * zfs_send_resume_token_to_nvlist() */ ret = zfs_error(hdl, EZFS_FAULT, errbuf); goto out; } /* * If a resume token is provided we use the object and offset * from that instead of the default, which starts from the * beginning. 
*/ if (resume_token != NULL) { resume_nvl = zfs_send_resume_token_to_nvlist(hdl, resume_token); if (resume_nvl == NULL) { ret = zfs_error(hdl, EZFS_FAULT, errbuf); goto out; } if (nvlist_lookup_uint64(resume_nvl, "object", &obj) != 0 || nvlist_lookup_uint64(resume_nvl, "offset", &off) != 0 || nvlist_lookup_uint64(resume_nvl, "bytes", &bytes) != 0 || nvlist_lookup_uint64(resume_nvl, "toguid", &resume_guid) != 0) { zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "provided resume token is corrupt")); ret = zfs_error(hdl, EZFS_FAULT, errbuf); goto out; } if (nvlist_lookup_uint64(saved_nvl, "toguid", &saved_guid)) { zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "dataset's resume token is corrupt")); ret = zfs_error(hdl, EZFS_FAULT, errbuf); goto out; } if (resume_guid != saved_guid) { zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "provided resume token does not match dataset")); ret = zfs_error(hdl, EZFS_BADBACKUP, errbuf); goto out; } } (void) nvlist_remove_all(saved_nvl, "object"); fnvlist_add_uint64(saved_nvl, "object", obj); (void) nvlist_remove_all(saved_nvl, "offset"); fnvlist_add_uint64(saved_nvl, "offset", off); (void) nvlist_remove_all(saved_nvl, "bytes"); fnvlist_add_uint64(saved_nvl, "bytes", bytes); (void) nvlist_remove_all(saved_nvl, "toname"); fnvlist_add_string(saved_nvl, "toname", zhp->zfs_name); ret = zfs_send_resume_impl(hdl, flags, outfd, saved_nvl); out: fnvlist_free(saved_nvl); fnvlist_free(resume_nvl); return (ret); } /* * This function informs the target system that the recursive send is complete. * The record is also expected in the case of a send -p. 
*/ static int send_conclusion_record(int fd, zio_cksum_t *zc) { dmu_replay_record_t drr = { 0 }; drr.drr_type = DRR_END; if (zc != NULL) drr.drr_u.drr_end.drr_checksum = *zc; if (write(fd, &drr, sizeof (drr)) == -1) { return (errno); } return (0); } /* * This function is responsible for sending the records that contain the * necessary information for the target system's libzfs to be able to set the * properties of the filesystem being received, or to be able to prepare for * a recursive receive. * * The "zhp" argument is the handle of the snapshot we are sending * (the "tosnap"). The "from" argument is the short snapshot name (the part * after the @) of the incremental source. */ static int send_prelim_records(zfs_handle_t *zhp, const char *from, int fd, boolean_t gather_props, boolean_t recursive, boolean_t verbose, boolean_t dryrun, boolean_t raw, boolean_t replicate, boolean_t skipmissing, boolean_t backup, boolean_t holds, boolean_t props, boolean_t doall, nvlist_t **fssp, avl_tree_t **fsavlp) { int err = 0; char *packbuf = NULL; size_t buflen = 0; zio_cksum_t zc = { {0} }; int featureflags = 0; /* name of filesystem/volume that contains snapshot we are sending */ char tofs[ZFS_MAX_DATASET_NAME_LEN]; /* short name of snap we are sending */ const char *tosnap = ""; char errbuf[ERRBUFLEN]; (void) snprintf(errbuf, sizeof (errbuf), dgettext(TEXT_DOMAIN, "warning: cannot send '%s'"), zhp->zfs_name); if (zhp->zfs_type == ZFS_TYPE_FILESYSTEM && zfs_prop_get_int(zhp, ZFS_PROP_VERSION) >= ZPL_VERSION_SA) { featureflags |= DMU_BACKUP_FEATURE_SA_SPILL; } if (holds) featureflags |= DMU_BACKUP_FEATURE_HOLDS; (void) strlcpy(tofs, zhp->zfs_name, ZFS_MAX_DATASET_NAME_LEN); char *at = strchr(tofs, '@'); if (at != NULL) { *at = '\0'; tosnap = at + 1; } if (gather_props) { nvlist_t *hdrnv = fnvlist_alloc(); nvlist_t *fss = NULL; if (from != NULL) fnvlist_add_string(hdrnv, "fromsnap", from); fnvlist_add_string(hdrnv, "tosnap", tosnap); if (!recursive) fnvlist_add_boolean(hdrnv, 
"not_recursive"); if (raw) { fnvlist_add_boolean(hdrnv, "raw"); } if (gather_nvlist(zhp->zfs_hdl, tofs, from, tosnap, recursive, raw, doall, replicate, skipmissing, verbose, backup, holds, props, &fss, fsavlp) != 0) { return (zfs_error(zhp->zfs_hdl, EZFS_BADBACKUP, errbuf)); } /* * Do not allow the size of the properties list to exceed * the limit */ if ((fnvlist_size(fss) + fnvlist_size(hdrnv)) > zhp->zfs_hdl->libzfs_max_nvlist) { (void) snprintf(errbuf, sizeof (errbuf), dgettext(TEXT_DOMAIN, "warning: cannot send '%s': " "the size of the list of snapshots and properties " "is too large to be received successfully.\n" "Select a smaller number of snapshots to send.\n"), zhp->zfs_name); return (zfs_error(zhp->zfs_hdl, EZFS_NOSPC, errbuf)); } fnvlist_add_nvlist(hdrnv, "fss", fss); VERIFY0(nvlist_pack(hdrnv, &packbuf, &buflen, NV_ENCODE_XDR, 0)); if (fssp != NULL) { *fssp = fss; } else { fnvlist_free(fss); } fnvlist_free(hdrnv); } if (!dryrun) { dmu_replay_record_t drr = { 0 }; /* write first begin record */ drr.drr_type = DRR_BEGIN; drr.drr_u.drr_begin.drr_magic = DMU_BACKUP_MAGIC; DMU_SET_STREAM_HDRTYPE(drr.drr_u.drr_begin. drr_versioninfo, DMU_COMPOUNDSTREAM); DMU_SET_FEATUREFLAGS(drr.drr_u.drr_begin. drr_versioninfo, featureflags); if (snprintf(drr.drr_u.drr_begin.drr_toname, sizeof (drr.drr_u.drr_begin.drr_toname), "%s@%s", tofs, tosnap) >= sizeof (drr.drr_u.drr_begin.drr_toname)) { return (zfs_error(zhp->zfs_hdl, EZFS_BADBACKUP, errbuf)); } drr.drr_payloadlen = buflen; err = dump_record(&drr, packbuf, buflen, &zc, fd); free(packbuf); if (err != 0) { zfs_error_aux(zhp->zfs_hdl, "%s", strerror(err)); return (zfs_error(zhp->zfs_hdl, EZFS_BADBACKUP, errbuf)); } err = send_conclusion_record(fd, &zc); if (err != 0) { zfs_error_aux(zhp->zfs_hdl, "%s", strerror(err)); return (zfs_error(zhp->zfs_hdl, EZFS_BADBACKUP, errbuf)); } } return (0); } /* * Generate a send stream. The "zhp" argument is the filesystem/volume * that contains the snapshot to send. 
The "fromsnap" argument is the * short name (the part after the '@') of the snapshot that is the * incremental source to send from (if non-NULL). The "tosnap" argument * is the short name of the snapshot to send. * * The content of the send stream is the snapshot identified by * 'tosnap'. Incremental streams are requested in two ways: * - from the snapshot identified by "fromsnap" (if non-null) or * - from the origin of the dataset identified by zhp, which must * be a clone. In this case, "fromsnap" is null and "fromorigin" * is TRUE. * * The send stream is recursive (i.e. dumps a hierarchy of snapshots) and * uses a special header (with a hdrtype field of DMU_COMPOUNDSTREAM) * if "replicate" is set. If "doall" is set, dump all the intermediate * snapshots. The DMU_COMPOUNDSTREAM header is used in the "doall" * case too. If "props" is set, send properties. * * Pre-wrapped (cf. lzc_send_wrapper()). */ static int zfs_send_cb_impl(zfs_handle_t *zhp, const char *fromsnap, const char *tosnap, sendflags_t *flags, int outfd, snapfilter_cb_t filter_func, void *cb_arg, nvlist_t **debugnvp) { char errbuf[ERRBUFLEN]; send_dump_data_t sdd = { 0 }; int err = 0; nvlist_t *fss = NULL; avl_tree_t *fsavl = NULL; static uint64_t holdseq; int spa_version; FILE *fout; (void) snprintf(errbuf, sizeof (errbuf), dgettext(TEXT_DOMAIN, "cannot send '%s'"), zhp->zfs_name); if (fromsnap && fromsnap[0] == '\0') { zfs_error_aux(zhp->zfs_hdl, dgettext(TEXT_DOMAIN, "zero-length incremental source")); return (zfs_error(zhp->zfs_hdl, EZFS_NOENT, errbuf)); } if (fromsnap) { char full_fromsnap_name[ZFS_MAX_DATASET_NAME_LEN]; if (snprintf(full_fromsnap_name, sizeof (full_fromsnap_name), "%s@%s", zhp->zfs_name, fromsnap) >= sizeof (full_fromsnap_name)) { err = EINVAL; goto stderr_out; } zfs_handle_t *fromsnapn = zfs_open(zhp->zfs_hdl, full_fromsnap_name, ZFS_TYPE_SNAPSHOT); if (fromsnapn == NULL) { err = -1; goto err_out; } zfs_close(fromsnapn); } if (flags->replicate || flags->doall || flags->props || 
flags->holds || flags->backup) { char full_tosnap_name[ZFS_MAX_DATASET_NAME_LEN]; if (snprintf(full_tosnap_name, sizeof (full_tosnap_name), "%s@%s", zhp->zfs_name, tosnap) >= sizeof (full_tosnap_name)) { err = EINVAL; goto stderr_out; } zfs_handle_t *tosnap = zfs_open(zhp->zfs_hdl, full_tosnap_name, ZFS_TYPE_SNAPSHOT); if (tosnap == NULL) { err = -1; goto err_out; } err = send_prelim_records(tosnap, fromsnap, outfd, flags->replicate || flags->props || flags->holds, flags->replicate, flags->verbosity > 0, flags->dryrun, flags->raw, flags->replicate, flags->skipmissing, flags->backup, flags->holds, flags->props, flags->doall, &fss, &fsavl); zfs_close(tosnap); if (err != 0) goto err_out; } /* dump each stream */ sdd.fromsnap = fromsnap; sdd.tosnap = tosnap; sdd.outfd = outfd; sdd.replicate = flags->replicate; sdd.doall = flags->doall; sdd.fromorigin = flags->fromorigin; sdd.fss = fss; sdd.fsavl = fsavl; sdd.verbosity = flags->verbosity; sdd.parsable = flags->parsable; sdd.progress = flags->progress; sdd.progressastitle = flags->progressastitle; sdd.dryrun = flags->dryrun; sdd.large_block = flags->largeblock; sdd.embed_data = flags->embed_data; sdd.compress = flags->compress; sdd.raw = flags->raw; sdd.holds = flags->holds; sdd.filter_cb = filter_func; sdd.filter_cb_arg = cb_arg; if (debugnvp) sdd.debugnv = *debugnvp; if (sdd.verbosity != 0 && sdd.dryrun) sdd.std_out = B_TRUE; fout = sdd.std_out ? stdout : stderr; /* * Some flags require that we place user holds on the datasets that are * being sent so they don't get destroyed during the send. We can skip * this step if the pool is imported read-only since the datasets cannot * be destroyed. 
*/ if (!flags->dryrun && !zpool_get_prop_int(zfs_get_pool_handle(zhp), ZPOOL_PROP_READONLY, NULL) && zfs_spa_version(zhp, &spa_version) == 0 && spa_version >= SPA_VERSION_USERREFS && (flags->doall || flags->replicate)) { ++holdseq; (void) snprintf(sdd.holdtag, sizeof (sdd.holdtag), ".send-%d-%llu", getpid(), (u_longlong_t)holdseq); sdd.cleanup_fd = open(ZFS_DEV, O_RDWR | O_CLOEXEC); if (sdd.cleanup_fd < 0) { err = errno; goto stderr_out; } sdd.snapholds = fnvlist_alloc(); } else { sdd.cleanup_fd = -1; sdd.snapholds = NULL; } if (flags->verbosity != 0 || sdd.snapholds != NULL) { /* * Do a verbose no-op dry run to get all the verbose output * or to gather snapshot hold's before generating any data, * then do a non-verbose real run to generate the streams. */ sdd.dryrun = B_TRUE; err = dump_filesystems(zhp, &sdd); if (err != 0) goto stderr_out; if (flags->verbosity != 0) { if (flags->parsable) { (void) fprintf(fout, "size\t%llu\n", (longlong_t)sdd.size); } else { char buf[16]; zfs_nicebytes(sdd.size, buf, sizeof (buf)); (void) fprintf(fout, dgettext(TEXT_DOMAIN, "total estimated size is %s\n"), buf); } } /* Ensure no snaps found is treated as an error. */ if (!sdd.seento) { err = ENOENT; goto err_out; } /* Skip the second run if dryrun was requested. */ if (flags->dryrun) goto err_out; if (sdd.snapholds != NULL) { err = zfs_hold_nvl(zhp, sdd.cleanup_fd, sdd.snapholds); if (err != 0) goto stderr_out; fnvlist_free(sdd.snapholds); sdd.snapholds = NULL; } sdd.dryrun = B_FALSE; sdd.verbosity = 0; } err = dump_filesystems(zhp, &sdd); fsavl_destroy(fsavl); fnvlist_free(fss); /* Ensure no snaps found is treated as an error. */ if (err == 0 && !sdd.seento) err = ENOENT; if (sdd.cleanup_fd != -1) { VERIFY(0 == close(sdd.cleanup_fd)); sdd.cleanup_fd = -1; } if (!flags->dryrun && (flags->replicate || flags->doall || flags->props || flags->backup || flags->holds)) { /* * write final end record. 
NB: want to do this even if * there was some error, because it might not be totally * failed. */ int err2 = send_conclusion_record(outfd, NULL); if (err2 != 0) return (zfs_standard_error(zhp->zfs_hdl, err2, errbuf)); } return (err || sdd.err); stderr_out: err = zfs_standard_error(zhp->zfs_hdl, err, errbuf); err_out: fsavl_destroy(fsavl); fnvlist_free(fss); fnvlist_free(sdd.snapholds); if (sdd.cleanup_fd != -1) VERIFY(0 == close(sdd.cleanup_fd)); return (err); } struct zfs_send { zfs_handle_t *zhp; const char *fromsnap; const char *tosnap; sendflags_t *flags; snapfilter_cb_t *filter_func; void *cb_arg; nvlist_t **debugnvp; }; static int zfs_send_cb(int outfd, void *arg) { struct zfs_send *zs = arg; return (zfs_send_cb_impl(zs->zhp, zs->fromsnap, zs->tosnap, zs->flags, outfd, zs->filter_func, zs->cb_arg, zs->debugnvp)); } int zfs_send(zfs_handle_t *zhp, const char *fromsnap, const char *tosnap, sendflags_t *flags, int outfd, snapfilter_cb_t filter_func, void *cb_arg, nvlist_t **debugnvp) { struct zfs_send arg = { .zhp = zhp, .fromsnap = fromsnap, .tosnap = tosnap, .flags = flags, .filter_func = filter_func, .cb_arg = cb_arg, .debugnvp = debugnvp, }; return (lzc_send_wrapper(zfs_send_cb, outfd, &arg)); } static zfs_handle_t * name_to_dir_handle(libzfs_handle_t *hdl, const char *snapname) { char dirname[ZFS_MAX_DATASET_NAME_LEN]; (void) strlcpy(dirname, snapname, ZFS_MAX_DATASET_NAME_LEN); char *c = strchr(dirname, '@'); if (c != NULL) *c = '\0'; return (zfs_open(hdl, dirname, ZFS_TYPE_DATASET)); } /* * Returns B_TRUE if earlier is an earlier snapshot in later's timeline; either * an earlier snapshot in the same filesystem, or a snapshot before later's * origin, or it's origin's origin, etc. */ static boolean_t snapshot_is_before(zfs_handle_t *earlier, zfs_handle_t *later) { boolean_t ret; uint64_t later_txg = (later->zfs_type == ZFS_TYPE_FILESYSTEM || later->zfs_type == ZFS_TYPE_VOLUME ? 
UINT64_MAX : zfs_prop_get_int(later, ZFS_PROP_CREATETXG)); uint64_t earlier_txg = zfs_prop_get_int(earlier, ZFS_PROP_CREATETXG); if (earlier_txg >= later_txg) return (B_FALSE); zfs_handle_t *earlier_dir = name_to_dir_handle(earlier->zfs_hdl, earlier->zfs_name); zfs_handle_t *later_dir = name_to_dir_handle(later->zfs_hdl, later->zfs_name); if (strcmp(earlier_dir->zfs_name, later_dir->zfs_name) == 0) { zfs_close(earlier_dir); zfs_close(later_dir); return (B_TRUE); } char clonename[ZFS_MAX_DATASET_NAME_LEN]; if (zfs_prop_get(later_dir, ZFS_PROP_ORIGIN, clonename, ZFS_MAX_DATASET_NAME_LEN, NULL, NULL, 0, B_TRUE) != 0) { zfs_close(earlier_dir); zfs_close(later_dir); return (B_FALSE); } zfs_handle_t *origin = zfs_open(earlier->zfs_hdl, clonename, ZFS_TYPE_DATASET); uint64_t origin_txg = zfs_prop_get_int(origin, ZFS_PROP_CREATETXG); /* * If "earlier" is exactly the origin, then * snapshot_is_before(earlier, origin) will return false (because * they're the same). */ if (origin_txg == earlier_txg && strcmp(origin->zfs_name, earlier->zfs_name) == 0) { zfs_close(earlier_dir); zfs_close(later_dir); zfs_close(origin); return (B_TRUE); } zfs_close(earlier_dir); zfs_close(later_dir); ret = snapshot_is_before(earlier, origin); zfs_close(origin); return (ret); } /* * The "zhp" argument is the handle of the dataset to send (typically a * snapshot). The "from" argument is the full name of the snapshot or * bookmark that is the incremental source. * * Pre-wrapped (cf. lzc_send_wrapper()). 
*/ static int zfs_send_one_cb_impl(zfs_handle_t *zhp, const char *from, int fd, sendflags_t *flags, const char *redactbook) { int err; libzfs_handle_t *hdl = zhp->zfs_hdl; char *name = zhp->zfs_name; pthread_t ptid; progress_arg_t pa = { 0 }; uint64_t size = 0; char errbuf[ERRBUFLEN]; (void) snprintf(errbuf, sizeof (errbuf), dgettext(TEXT_DOMAIN, "warning: cannot send '%s'"), name); if (from != NULL && strchr(from, '@')) { zfs_handle_t *from_zhp = zfs_open(hdl, from, ZFS_TYPE_DATASET); if (from_zhp == NULL) return (-1); if (!snapshot_is_before(from_zhp, zhp)) { zfs_close(from_zhp); zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "not an earlier snapshot from the same fs")); return (zfs_error(hdl, EZFS_CROSSTARGET, errbuf)); } zfs_close(from_zhp); } if (redactbook != NULL) { char bookname[ZFS_MAX_DATASET_NAME_LEN]; nvlist_t *redact_snaps; zfs_handle_t *book_zhp; char *at, *pound; int dsnamelen; pound = strchr(redactbook, '#'); if (pound != NULL) redactbook = pound + 1; at = strchr(name, '@'); if (at == NULL) { zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "cannot do a redacted send to a filesystem")); return (zfs_error(hdl, EZFS_BADTYPE, errbuf)); } dsnamelen = at - name; if (snprintf(bookname, sizeof (bookname), "%.*s#%s", dsnamelen, name, redactbook) >= sizeof (bookname)) { zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "invalid bookmark name")); return (zfs_error(hdl, EZFS_INVALIDNAME, errbuf)); } book_zhp = zfs_open(hdl, bookname, ZFS_TYPE_BOOKMARK); if (book_zhp == NULL) return (-1); if (nvlist_lookup_nvlist(book_zhp->zfs_props, zfs_prop_to_name(ZFS_PROP_REDACT_SNAPS), &redact_snaps) != 0 || redact_snaps == NULL) { zfs_close(book_zhp); zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "not a redaction bookmark")); return (zfs_error(hdl, EZFS_BADTYPE, errbuf)); } zfs_close(book_zhp); } /* * Send fs properties */ if (flags->props || flags->holds || flags->backup) { /* * Note: the header generated by send_prelim_records() * assumes that the incremental source is in the same * 
filesystem/volume as the target (which is a requirement * when doing "zfs send -R"). But that isn't always the * case here (e.g. send from snap in origin, or send from * bookmark). We pass from=NULL, which will omit this * information from the prelim records; it isn't used * when receiving this type of stream. */ err = send_prelim_records(zhp, NULL, fd, B_TRUE, B_FALSE, flags->verbosity > 0, flags->dryrun, flags->raw, flags->replicate, B_FALSE, flags->backup, flags->holds, flags->props, flags->doall, NULL, NULL); if (err != 0) return (err); } /* * Perform size estimate if verbose was specified. */ if (flags->verbosity != 0 || flags->progressastitle) { err = estimate_size(zhp, from, fd, flags, 0, 0, 0, redactbook, errbuf, &size); if (err != 0) return (err); } if (flags->dryrun) return (0); /* * If progress reporting is requested, spawn a new thread to poll * ZFS_IOC_SEND_PROGRESS at a regular interval. */ if (flags->progress || flags->progressastitle) { pa.pa_zhp = zhp; pa.pa_fd = fd; pa.pa_parsable = flags->parsable; pa.pa_estimate = B_FALSE; pa.pa_verbosity = flags->verbosity; pa.pa_size = size; pa.pa_astitle = flags->progressastitle; err = pthread_create(&ptid, NULL, send_progress_thread, &pa); if (err != 0) { zfs_error_aux(zhp->zfs_hdl, "%s", strerror(errno)); return (zfs_error(zhp->zfs_hdl, EZFS_THREADCREATEFAILED, errbuf)); } } err = lzc_send_redacted(name, from, fd, lzc_flags_from_sendflags(flags), redactbook); if ((flags->progress || flags->progressastitle) && send_progress_thread_exit(hdl, ptid)) return (-1); if (err == 0 && (flags->props || flags->holds || flags->backup)) { /* Write the final end record. 
*/ err = send_conclusion_record(fd, NULL); if (err != 0) return (zfs_standard_error(hdl, err, errbuf)); } if (err != 0) { switch (errno) { case EXDEV: zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "not an earlier snapshot from the same fs")); return (zfs_error(hdl, EZFS_CROSSTARGET, errbuf)); case ENOENT: case ESRCH: if (lzc_exists(name)) { zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "incremental source (%s) does not exist"), from); } return (zfs_error(hdl, EZFS_NOENT, errbuf)); case EACCES: zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "dataset key must be loaded")); return (zfs_error(hdl, EZFS_CRYPTOFAILED, errbuf)); case EBUSY: zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "target is busy; if a filesystem, " "it must not be mounted")); return (zfs_error(hdl, EZFS_BUSY, errbuf)); case EDQUOT: case EFAULT: case EFBIG: case EINVAL: case EIO: case ENOLINK: case ENOSPC: case ENOSTR: case ENXIO: case EPIPE: case ERANGE: case EROFS: zfs_error_aux(hdl, "%s", strerror(errno)); return (zfs_error(hdl, EZFS_BADBACKUP, errbuf)); default: return (zfs_standard_error(hdl, errno, errbuf)); } } return (err != 0); } struct zfs_send_one { zfs_handle_t *zhp; const char *from; sendflags_t *flags; const char *redactbook; }; static int zfs_send_one_cb(int fd, void *arg) { struct zfs_send_one *zso = arg; return (zfs_send_one_cb_impl(zso->zhp, zso->from, fd, zso->flags, zso->redactbook)); } int zfs_send_one(zfs_handle_t *zhp, const char *from, int fd, sendflags_t *flags, const char *redactbook) { struct zfs_send_one zso = { .zhp = zhp, .from = from, .flags = flags, .redactbook = redactbook, }; return (lzc_send_wrapper(zfs_send_one_cb, fd, &zso)); } /* * Routines specific to "zfs recv" */ static int recv_read(libzfs_handle_t *hdl, int fd, void *buf, int ilen, boolean_t byteswap, zio_cksum_t *zc) { char *cp = buf; int rv; int len = ilen; do { rv = read(fd, cp, len); cp += rv; len -= rv; } while (rv > 0); if (rv < 0 || len != 0) { zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "failed to read from stream")); 
return (zfs_error(hdl, EZFS_BADSTREAM, dgettext(TEXT_DOMAIN, "cannot receive"))); } if (zc) { if (byteswap) fletcher_4_incremental_byteswap(buf, ilen, zc); else fletcher_4_incremental_native(buf, ilen, zc); } return (0); } static int recv_read_nvlist(libzfs_handle_t *hdl, int fd, int len, nvlist_t **nvp, boolean_t byteswap, zio_cksum_t *zc) { char *buf; int err; buf = zfs_alloc(hdl, len); if (len > hdl->libzfs_max_nvlist) { zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "nvlist too large")); free(buf); return (ENOMEM); } err = recv_read(hdl, fd, buf, len, byteswap, zc); if (err != 0) { free(buf); return (err); } err = nvlist_unpack(buf, len, nvp, 0); free(buf); if (err != 0) { zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "invalid " "stream (malformed nvlist)")); return (EINVAL); } return (0); } /* * Returns the grand origin (origin of origin of origin...) of a given handle. * If this dataset is not a clone, it simply returns a copy of the original * handle. */ static zfs_handle_t * recv_open_grand_origin(zfs_handle_t *zhp) { char origin[ZFS_MAX_DATASET_NAME_LEN]; zprop_source_t src; zfs_handle_t *ozhp = zfs_handle_dup(zhp); while (ozhp != NULL) { if (zfs_prop_get(ozhp, ZFS_PROP_ORIGIN, origin, sizeof (origin), &src, NULL, 0, B_FALSE) != 0) break; (void) zfs_close(ozhp); ozhp = zfs_open(zhp->zfs_hdl, origin, ZFS_TYPE_FILESYSTEM); } return (ozhp); } static int recv_rename_impl(zfs_handle_t *zhp, const char *name, const char *newname) { int err; zfs_handle_t *ozhp = NULL; /* * Attempt to rename the dataset. If it fails with EACCES we have * attempted to rename the dataset outside of its encryption root. * Force the dataset to become an encryption root and try again. 
*/ err = lzc_rename(name, newname); if (err == EACCES) { ozhp = recv_open_grand_origin(zhp); if (ozhp == NULL) { err = ENOENT; goto out; } err = lzc_change_key(ozhp->zfs_name, DCP_CMD_FORCE_NEW_KEY, NULL, NULL, 0); if (err != 0) goto out; err = lzc_rename(name, newname); } out: if (ozhp != NULL) zfs_close(ozhp); return (err); } static int recv_rename(libzfs_handle_t *hdl, const char *name, const char *tryname, int baselen, char *newname, recvflags_t *flags) { static int seq; int err; prop_changelist_t *clp = NULL; zfs_handle_t *zhp = NULL; zhp = zfs_open(hdl, name, ZFS_TYPE_DATASET); if (zhp == NULL) { err = -1; goto out; } clp = changelist_gather(zhp, ZFS_PROP_NAME, 0, flags->force ? MS_FORCE : 0); if (clp == NULL) { err = -1; goto out; } err = changelist_prefix(clp); if (err) goto out; if (tryname) { (void) strlcpy(newname, tryname, ZFS_MAX_DATASET_NAME_LEN); if (flags->verbose) { (void) printf("attempting rename %s to %s\n", name, newname); } err = recv_rename_impl(zhp, name, newname); if (err == 0) changelist_rename(clp, name, tryname); } else { err = ENOENT; } if (err != 0 && strncmp(name + baselen, "recv-", 5) != 0) { seq++; (void) snprintf(newname, ZFS_MAX_DATASET_NAME_LEN, "%.*srecv-%u-%u", baselen, name, getpid(), seq); if (flags->verbose) { (void) printf("failed - trying rename %s to %s\n", name, newname); } err = recv_rename_impl(zhp, name, newname); if (err == 0) changelist_rename(clp, name, newname); if (err && flags->verbose) { (void) printf("failed (%u) - " "will try again on next pass\n", errno); } err = EAGAIN; } else if (flags->verbose) { if (err == 0) (void) printf("success\n"); else (void) printf("failed (%u)\n", errno); } (void) changelist_postfix(clp); out: if (clp != NULL) changelist_free(clp); if (zhp != NULL) zfs_close(zhp); return (err); } static int recv_promote(libzfs_handle_t *hdl, const char *fsname, const char *origin_fsname, recvflags_t *flags) { int err; zfs_cmd_t zc = {"\0"}; zfs_handle_t *zhp = NULL, *ozhp = NULL; if 
(flags->verbose) (void) printf("promoting %s\n", fsname); (void) strlcpy(zc.zc_value, origin_fsname, sizeof (zc.zc_value)); (void) strlcpy(zc.zc_name, fsname, sizeof (zc.zc_name)); /* * Attempt to promote the dataset. If it fails with EACCES the * promotion would cause this dataset to leave its encryption root. * Force the origin to become an encryption root and try again. */ err = zfs_ioctl(hdl, ZFS_IOC_PROMOTE, &zc); if (err == EACCES) { zhp = zfs_open(hdl, fsname, ZFS_TYPE_DATASET); if (zhp == NULL) { err = -1; goto out; } ozhp = recv_open_grand_origin(zhp); if (ozhp == NULL) { err = -1; goto out; } err = lzc_change_key(ozhp->zfs_name, DCP_CMD_FORCE_NEW_KEY, NULL, NULL, 0); if (err != 0) goto out; err = zfs_ioctl(hdl, ZFS_IOC_PROMOTE, &zc); } out: if (zhp != NULL) zfs_close(zhp); if (ozhp != NULL) zfs_close(ozhp); return (err); } static int recv_destroy(libzfs_handle_t *hdl, const char *name, int baselen, char *newname, recvflags_t *flags) { int err = 0; prop_changelist_t *clp; zfs_handle_t *zhp; boolean_t defer = B_FALSE; int spa_version; zhp = zfs_open(hdl, name, ZFS_TYPE_DATASET); if (zhp == NULL) return (-1); zfs_type_t type = zfs_get_type(zhp); if (type == ZFS_TYPE_SNAPSHOT && zfs_spa_version(zhp, &spa_version) == 0 && spa_version >= SPA_VERSION_USERREFS) defer = B_TRUE; clp = changelist_gather(zhp, ZFS_PROP_NAME, 0, flags->force ? 
MS_FORCE : 0); zfs_close(zhp); if (clp == NULL) return (-1); err = changelist_prefix(clp); if (err) return (err); if (flags->verbose) (void) printf("attempting destroy %s\n", name); if (type == ZFS_TYPE_SNAPSHOT) { nvlist_t *nv = fnvlist_alloc(); fnvlist_add_boolean(nv, name); err = lzc_destroy_snaps(nv, defer, NULL); fnvlist_free(nv); } else { err = lzc_destroy(name); } if (err == 0) { if (flags->verbose) (void) printf("success\n"); changelist_remove(clp, name); } (void) changelist_postfix(clp); changelist_free(clp); /* * Deferred destroy might destroy the snapshot or only mark it to be * destroyed later, and it returns success in either case. */ if (err != 0 || (defer && zfs_dataset_exists(hdl, name, ZFS_TYPE_SNAPSHOT))) { err = recv_rename(hdl, name, NULL, baselen, newname, flags); } return (err); } typedef struct guid_to_name_data { uint64_t guid; boolean_t bookmark_ok; char *name; char *skip; uint64_t *redact_snap_guids; uint64_t num_redact_snaps; } guid_to_name_data_t; static boolean_t redact_snaps_match(zfs_handle_t *zhp, guid_to_name_data_t *gtnd) { uint64_t *bmark_snaps; uint_t bmark_num_snaps; nvlist_t *nvl; if (zhp->zfs_type != ZFS_TYPE_BOOKMARK) return (B_FALSE); nvl = fnvlist_lookup_nvlist(zhp->zfs_props, zfs_prop_to_name(ZFS_PROP_REDACT_SNAPS)); bmark_snaps = fnvlist_lookup_uint64_array(nvl, ZPROP_VALUE, &bmark_num_snaps); if (bmark_num_snaps != gtnd->num_redact_snaps) return (B_FALSE); int i = 0; for (; i < bmark_num_snaps; i++) { int j = 0; for (; j < bmark_num_snaps; j++) { if (bmark_snaps[i] == gtnd->redact_snap_guids[j]) break; } if (j == bmark_num_snaps) break; } return (i == bmark_num_snaps); } static int guid_to_name_cb(zfs_handle_t *zhp, void *arg) { guid_to_name_data_t *gtnd = arg; const char *slash; int err; if (gtnd->skip != NULL && (slash = strrchr(zhp->zfs_name, '/')) != NULL && strcmp(slash + 1, gtnd->skip) == 0) { zfs_close(zhp); return (0); } if (zfs_prop_get_int(zhp, ZFS_PROP_GUID) == gtnd->guid && (gtnd->num_redact_snaps == -1 || 
redact_snaps_match(zhp, gtnd))) { (void) strcpy(gtnd->name, zhp->zfs_name); zfs_close(zhp); return (EEXIST); } err = zfs_iter_children(zhp, 0, guid_to_name_cb, gtnd); if (err != EEXIST && gtnd->bookmark_ok) err = zfs_iter_bookmarks(zhp, 0, guid_to_name_cb, gtnd); zfs_close(zhp); return (err); } /* * Attempt to find the local dataset associated with this guid. In the case of * multiple matches, we attempt to find the "best" match by searching * progressively larger portions of the hierarchy. This allows one to send a * tree of datasets individually and guarantee that we will find the source * guid within that hierarchy, even if there are multiple matches elsewhere. * * If num_redact_snaps is not -1, we attempt to find a redaction bookmark with * the specified number of redaction snapshots. If num_redact_snaps isn't 0 or * -1, then redact_snap_guids will be an array of the guids of the snapshots the * redaction bookmark was created with. If num_redact_snaps is -1, then we will * attempt to find a snapshot or bookmark (if bookmark_ok is passed) with the * given guid. Note that a redaction bookmark can be returned if * num_redact_snaps == -1. */ static int guid_to_name_redact_snaps(libzfs_handle_t *hdl, const char *parent, uint64_t guid, boolean_t bookmark_ok, uint64_t *redact_snap_guids, uint64_t num_redact_snaps, char *name) { char pname[ZFS_MAX_DATASET_NAME_LEN]; guid_to_name_data_t gtnd; gtnd.guid = guid; gtnd.bookmark_ok = bookmark_ok; gtnd.name = name; gtnd.skip = NULL; gtnd.redact_snap_guids = redact_snap_guids; gtnd.num_redact_snaps = num_redact_snaps; /* * Search progressively larger portions of the hierarchy, starting * with the filesystem specified by 'parent'. This will * select the "most local" version of the origin snapshot in the case * that there are multiple matching snapshots in the system. 
*/ (void) strlcpy(pname, parent, sizeof (pname)); char *cp = strrchr(pname, '@'); if (cp == NULL) cp = strchr(pname, '\0'); for (; cp != NULL; cp = strrchr(pname, '/')) { /* Chop off the last component and open the parent */ *cp = '\0'; zfs_handle_t *zhp = make_dataset_handle(hdl, pname); if (zhp == NULL) continue; int err = guid_to_name_cb(zfs_handle_dup(zhp), >nd); if (err != EEXIST) err = zfs_iter_children(zhp, 0, guid_to_name_cb, >nd); if (err != EEXIST && bookmark_ok) err = zfs_iter_bookmarks(zhp, 0, guid_to_name_cb, >nd); zfs_close(zhp); if (err == EEXIST) return (0); /* * Remember the last portion of the dataset so we skip it next * time through (as we've already searched that portion of the * hierarchy). */ gtnd.skip = strrchr(pname, '/') + 1; } return (ENOENT); } static int guid_to_name(libzfs_handle_t *hdl, const char *parent, uint64_t guid, boolean_t bookmark_ok, char *name) { return (guid_to_name_redact_snaps(hdl, parent, guid, bookmark_ok, NULL, -1, name)); } /* * Return +1 if guid1 is before guid2, 0 if they are the same, and -1 if * guid1 is after guid2. 
*/ static int created_before(libzfs_handle_t *hdl, avl_tree_t *avl, uint64_t guid1, uint64_t guid2) { nvlist_t *nvfs; char *fsname = NULL, *snapname = NULL; char buf[ZFS_MAX_DATASET_NAME_LEN]; int rv; zfs_handle_t *guid1hdl, *guid2hdl; uint64_t create1, create2; if (guid2 == 0) return (0); if (guid1 == 0) return (1); nvfs = fsavl_find(avl, guid1, &snapname); fsname = fnvlist_lookup_string(nvfs, "name"); (void) snprintf(buf, sizeof (buf), "%s@%s", fsname, snapname); guid1hdl = zfs_open(hdl, buf, ZFS_TYPE_SNAPSHOT); if (guid1hdl == NULL) return (-1); nvfs = fsavl_find(avl, guid2, &snapname); fsname = fnvlist_lookup_string(nvfs, "name"); (void) snprintf(buf, sizeof (buf), "%s@%s", fsname, snapname); guid2hdl = zfs_open(hdl, buf, ZFS_TYPE_SNAPSHOT); if (guid2hdl == NULL) { zfs_close(guid1hdl); return (-1); } create1 = zfs_prop_get_int(guid1hdl, ZFS_PROP_CREATETXG); create2 = zfs_prop_get_int(guid2hdl, ZFS_PROP_CREATETXG); if (create1 < create2) rv = -1; else if (create1 > create2) rv = +1; else rv = 0; zfs_close(guid1hdl); zfs_close(guid2hdl); return (rv); } /* * This function reestablishes the hierarchy of encryption roots after a * recursive incremental receive has completed. This must be done after the * second call to recv_incremental_replication() has renamed and promoted all * sent datasets to their final locations in the dataset hierarchy. 
*/ static int recv_fix_encryption_hierarchy(libzfs_handle_t *hdl, const char *top_zfs, nvlist_t *stream_nv) { int err; nvpair_t *fselem = NULL; nvlist_t *stream_fss; stream_fss = fnvlist_lookup_nvlist(stream_nv, "fss"); while ((fselem = nvlist_next_nvpair(stream_fss, fselem)) != NULL) { zfs_handle_t *zhp = NULL; uint64_t crypt; nvlist_t *snaps, *props, *stream_nvfs = NULL; nvpair_t *snapel = NULL; boolean_t is_encroot, is_clone, stream_encroot; char *cp; char *stream_keylocation = NULL; char keylocation[MAXNAMELEN]; char fsname[ZFS_MAX_DATASET_NAME_LEN]; keylocation[0] = '\0'; stream_nvfs = fnvpair_value_nvlist(fselem); snaps = fnvlist_lookup_nvlist(stream_nvfs, "snaps"); props = fnvlist_lookup_nvlist(stream_nvfs, "props"); stream_encroot = nvlist_exists(stream_nvfs, "is_encroot"); /* find a snapshot from the stream that exists locally */ err = ENOENT; while ((snapel = nvlist_next_nvpair(snaps, snapel)) != NULL) { uint64_t guid; guid = fnvpair_value_uint64(snapel); err = guid_to_name(hdl, top_zfs, guid, B_FALSE, fsname); if (err == 0) break; } if (err != 0) continue; cp = strchr(fsname, '@'); if (cp != NULL) *cp = '\0'; zhp = zfs_open(hdl, fsname, ZFS_TYPE_DATASET); if (zhp == NULL) { err = ENOENT; goto error; } crypt = zfs_prop_get_int(zhp, ZFS_PROP_ENCRYPTION); is_clone = zhp->zfs_dmustats.dds_origin[0] != '\0'; (void) zfs_crypto_get_encryption_root(zhp, &is_encroot, NULL); /* we don't need to do anything for unencrypted datasets */ if (crypt == ZIO_CRYPT_OFF) { zfs_close(zhp); continue; } /* * If the dataset is flagged as an encryption root, was not * received as a clone and is not currently an encryption root, * force it to become one. Fixup the keylocation if necessary. 
*/ if (stream_encroot) { if (!is_clone && !is_encroot) { err = lzc_change_key(fsname, DCP_CMD_FORCE_NEW_KEY, NULL, NULL, 0); if (err != 0) { zfs_close(zhp); goto error; } } stream_keylocation = fnvlist_lookup_string(props, zfs_prop_to_name(ZFS_PROP_KEYLOCATION)); /* * Refresh the properties in case the call to * lzc_change_key() changed the value. */ zfs_refresh_properties(zhp); err = zfs_prop_get(zhp, ZFS_PROP_KEYLOCATION, keylocation, sizeof (keylocation), NULL, NULL, 0, B_TRUE); if (err != 0) { zfs_close(zhp); goto error; } if (strcmp(keylocation, stream_keylocation) != 0) { err = zfs_prop_set(zhp, zfs_prop_to_name(ZFS_PROP_KEYLOCATION), stream_keylocation); if (err != 0) { zfs_close(zhp); goto error; } } } /* * If the dataset is not flagged as an encryption root and is * currently an encryption root, force it to inherit from its * parent. The root of a raw send should never be * force-inherited. */ if (!stream_encroot && is_encroot && strcmp(top_zfs, fsname) != 0) { err = lzc_change_key(fsname, DCP_CMD_FORCE_INHERIT, NULL, NULL, 0); if (err != 0) { zfs_close(zhp); goto error; } } zfs_close(zhp); } return (0); error: return (err); } static int recv_incremental_replication(libzfs_handle_t *hdl, const char *tofs, recvflags_t *flags, nvlist_t *stream_nv, avl_tree_t *stream_avl, nvlist_t *renamed) { nvlist_t *local_nv, *deleted = NULL; avl_tree_t *local_avl; nvpair_t *fselem, *nextfselem; char *fromsnap; char newname[ZFS_MAX_DATASET_NAME_LEN]; char guidname[32]; int error; boolean_t needagain, progress, recursive; char *s1, *s2; fromsnap = fnvlist_lookup_string(stream_nv, "fromsnap"); recursive = (nvlist_lookup_boolean(stream_nv, "not_recursive") == ENOENT); if (flags->dryrun) return (0); again: needagain = progress = B_FALSE; deleted = fnvlist_alloc(); if ((error = gather_nvlist(hdl, tofs, fromsnap, NULL, recursive, B_TRUE, B_FALSE, recursive, B_FALSE, B_FALSE, B_FALSE, B_FALSE, B_TRUE, &local_nv, &local_avl)) != 0) return (error); /* * Process deletes and renames 
*/ for (fselem = nvlist_next_nvpair(local_nv, NULL); fselem; fselem = nextfselem) { nvlist_t *nvfs, *snaps; nvlist_t *stream_nvfs = NULL; nvpair_t *snapelem, *nextsnapelem; uint64_t fromguid = 0; uint64_t originguid = 0; uint64_t stream_originguid = 0; uint64_t parent_fromsnap_guid, stream_parent_fromsnap_guid; char *fsname, *stream_fsname; nextfselem = nvlist_next_nvpair(local_nv, fselem); nvfs = fnvpair_value_nvlist(fselem); snaps = fnvlist_lookup_nvlist(nvfs, "snaps"); fsname = fnvlist_lookup_string(nvfs, "name"); parent_fromsnap_guid = fnvlist_lookup_uint64(nvfs, "parentfromsnap"); (void) nvlist_lookup_uint64(nvfs, "origin", &originguid); /* * First find the stream's fs, so we can check for * a different origin (due to "zfs promote") */ for (snapelem = nvlist_next_nvpair(snaps, NULL); snapelem; snapelem = nvlist_next_nvpair(snaps, snapelem)) { uint64_t thisguid; thisguid = fnvpair_value_uint64(snapelem); stream_nvfs = fsavl_find(stream_avl, thisguid, NULL); if (stream_nvfs != NULL) break; } /* check for promote */ (void) nvlist_lookup_uint64(stream_nvfs, "origin", &stream_originguid); if (stream_nvfs && originguid != stream_originguid) { switch (created_before(hdl, local_avl, stream_originguid, originguid)) { case 1: { /* promote it! */ nvlist_t *origin_nvfs; char *origin_fsname; origin_nvfs = fsavl_find(local_avl, originguid, NULL); origin_fsname = fnvlist_lookup_string( origin_nvfs, "name"); error = recv_promote(hdl, fsname, origin_fsname, flags); if (error == 0) progress = B_TRUE; break; } default: break; case -1: fsavl_destroy(local_avl); fnvlist_free(local_nv); return (-1); } /* * We had/have the wrong origin, therefore our * list of snapshots is wrong. Need to handle * them on the next pass. 
*/ needagain = B_TRUE; continue; } for (snapelem = nvlist_next_nvpair(snaps, NULL); snapelem; snapelem = nextsnapelem) { uint64_t thisguid; char *stream_snapname; nvlist_t *found, *props; nextsnapelem = nvlist_next_nvpair(snaps, snapelem); thisguid = fnvpair_value_uint64(snapelem); found = fsavl_find(stream_avl, thisguid, &stream_snapname); /* check for delete */ if (found == NULL) { char name[ZFS_MAX_DATASET_NAME_LEN]; if (!flags->force) continue; (void) snprintf(name, sizeof (name), "%s@%s", fsname, nvpair_name(snapelem)); error = recv_destroy(hdl, name, strlen(fsname)+1, newname, flags); if (error) needagain = B_TRUE; else progress = B_TRUE; sprintf(guidname, "%llu", (u_longlong_t)thisguid); nvlist_add_boolean(deleted, guidname); continue; } stream_nvfs = found; if (0 == nvlist_lookup_nvlist(stream_nvfs, "snapprops", &props) && 0 == nvlist_lookup_nvlist(props, stream_snapname, &props)) { zfs_cmd_t zc = {"\0"}; zc.zc_cookie = B_TRUE; /* received */ (void) snprintf(zc.zc_name, sizeof (zc.zc_name), "%s@%s", fsname, nvpair_name(snapelem)); zcmd_write_src_nvlist(hdl, &zc, props); (void) zfs_ioctl(hdl, ZFS_IOC_SET_PROP, &zc); zcmd_free_nvlists(&zc); } /* check for different snapname */ if (strcmp(nvpair_name(snapelem), stream_snapname) != 0) { char name[ZFS_MAX_DATASET_NAME_LEN]; char tryname[ZFS_MAX_DATASET_NAME_LEN]; (void) snprintf(name, sizeof (name), "%s@%s", fsname, nvpair_name(snapelem)); (void) snprintf(tryname, sizeof (name), "%s@%s", fsname, stream_snapname); error = recv_rename(hdl, name, tryname, strlen(fsname)+1, newname, flags); if (error) needagain = B_TRUE; else progress = B_TRUE; } if (strcmp(stream_snapname, fromsnap) == 0) fromguid = thisguid; } /* check for delete */ if (stream_nvfs == NULL) { if (!flags->force) continue; error = recv_destroy(hdl, fsname, strlen(tofs)+1, newname, flags); if (error) needagain = B_TRUE; else progress = B_TRUE; sprintf(guidname, "%llu", (u_longlong_t)parent_fromsnap_guid); nvlist_add_boolean(deleted, guidname); 
continue; } if (fromguid == 0) { if (flags->verbose) { (void) printf("local fs %s does not have " "fromsnap (%s in stream); must have " "been deleted locally; ignoring\n", fsname, fromsnap); } continue; } stream_fsname = fnvlist_lookup_string(stream_nvfs, "name"); stream_parent_fromsnap_guid = fnvlist_lookup_uint64( stream_nvfs, "parentfromsnap"); s1 = strrchr(fsname, '/'); s2 = strrchr(stream_fsname, '/'); /* * Check if we're going to rename based on parent guid change * and the current parent guid was also deleted. If it was then * rename will fail and is likely unneeded, so avoid this and * force an early retry to determine the new * parent_fromsnap_guid. */ if (stream_parent_fromsnap_guid != 0 && parent_fromsnap_guid != 0 && stream_parent_fromsnap_guid != parent_fromsnap_guid) { sprintf(guidname, "%llu", (u_longlong_t)parent_fromsnap_guid); if (nvlist_exists(deleted, guidname)) { progress = B_TRUE; needagain = B_TRUE; goto doagain; } } /* * Check for rename. If the exact receive path is specified, it * does not count as a rename, but we still need to check the * datasets beneath it. */ if ((stream_parent_fromsnap_guid != 0 && parent_fromsnap_guid != 0 && stream_parent_fromsnap_guid != parent_fromsnap_guid) || ((flags->isprefix || strcmp(tofs, fsname) != 0) && (s1 != NULL) && (s2 != NULL) && strcmp(s1, s2) != 0)) { nvlist_t *parent; char tryname[ZFS_MAX_DATASET_NAME_LEN]; parent = fsavl_find(local_avl, stream_parent_fromsnap_guid, NULL); /* * NB: parent might not be found if we used the * tosnap for stream_parent_fromsnap_guid, * because the parent is a newly-created fs; * we'll be able to rename it after we recv the * new fs. 
*/ if (parent != NULL) { char *pname; pname = fnvlist_lookup_string(parent, "name"); (void) snprintf(tryname, sizeof (tryname), "%s%s", pname, strrchr(stream_fsname, '/')); } else { tryname[0] = '\0'; if (flags->verbose) { (void) printf("local fs %s new parent " "not found\n", fsname); } } newname[0] = '\0'; error = recv_rename(hdl, fsname, tryname, strlen(tofs)+1, newname, flags); if (renamed != NULL && newname[0] != '\0') { fnvlist_add_boolean(renamed, newname); } if (error) needagain = B_TRUE; else progress = B_TRUE; } } doagain: fsavl_destroy(local_avl); fnvlist_free(local_nv); fnvlist_free(deleted); if (needagain && progress) { /* do another pass to fix up temporary names */ if (flags->verbose) (void) printf("another pass:\n"); goto again; } return (needagain || error != 0); } static int zfs_receive_package(libzfs_handle_t *hdl, int fd, const char *destname, recvflags_t *flags, dmu_replay_record_t *drr, zio_cksum_t *zc, char **top_zfs, nvlist_t *cmdprops) { nvlist_t *stream_nv = NULL; avl_tree_t *stream_avl = NULL; char *fromsnap = NULL; char *sendsnap = NULL; char *cp; char tofs[ZFS_MAX_DATASET_NAME_LEN]; char sendfs[ZFS_MAX_DATASET_NAME_LEN]; char errbuf[ERRBUFLEN]; dmu_replay_record_t drre; int error; boolean_t anyerr = B_FALSE; boolean_t softerr = B_FALSE; boolean_t recursive, raw; (void) snprintf(errbuf, sizeof (errbuf), dgettext(TEXT_DOMAIN, "cannot receive")); assert(drr->drr_type == DRR_BEGIN); assert(drr->drr_u.drr_begin.drr_magic == DMU_BACKUP_MAGIC); assert(DMU_GET_STREAM_HDRTYPE(drr->drr_u.drr_begin.drr_versioninfo) == DMU_COMPOUNDSTREAM); /* * Read in the nvlist from the stream. 
*/ if (drr->drr_payloadlen != 0) { error = recv_read_nvlist(hdl, fd, drr->drr_payloadlen, &stream_nv, flags->byteswap, zc); if (error) { error = zfs_error(hdl, EZFS_BADSTREAM, errbuf); goto out; } } recursive = (nvlist_lookup_boolean(stream_nv, "not_recursive") == ENOENT); raw = (nvlist_lookup_boolean(stream_nv, "raw") == 0); if (recursive && strchr(destname, '@')) { zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "cannot specify snapshot name for multi-snapshot stream")); error = zfs_error(hdl, EZFS_BADSTREAM, errbuf); goto out; } /* * Read in the end record and verify checksum. */ if (0 != (error = recv_read(hdl, fd, &drre, sizeof (drre), flags->byteswap, NULL))) goto out; if (flags->byteswap) { drre.drr_type = BSWAP_32(drre.drr_type); drre.drr_u.drr_end.drr_checksum.zc_word[0] = BSWAP_64(drre.drr_u.drr_end.drr_checksum.zc_word[0]); drre.drr_u.drr_end.drr_checksum.zc_word[1] = BSWAP_64(drre.drr_u.drr_end.drr_checksum.zc_word[1]); drre.drr_u.drr_end.drr_checksum.zc_word[2] = BSWAP_64(drre.drr_u.drr_end.drr_checksum.zc_word[2]); drre.drr_u.drr_end.drr_checksum.zc_word[3] = BSWAP_64(drre.drr_u.drr_end.drr_checksum.zc_word[3]); } if (drre.drr_type != DRR_END) { error = zfs_error(hdl, EZFS_BADSTREAM, errbuf); goto out; } if (!ZIO_CHECKSUM_EQUAL(drre.drr_u.drr_end.drr_checksum, *zc)) { zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "incorrect header checksum")); error = zfs_error(hdl, EZFS_BADSTREAM, errbuf); goto out; } (void) nvlist_lookup_string(stream_nv, "fromsnap", &fromsnap); if (drr->drr_payloadlen != 0) { nvlist_t *stream_fss; stream_fss = fnvlist_lookup_nvlist(stream_nv, "fss"); if ((stream_avl = fsavl_create(stream_fss)) == NULL) { zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "couldn't allocate avl tree")); error = zfs_error(hdl, EZFS_NOMEM, errbuf); goto out; } if (fromsnap != NULL && recursive) { nvlist_t *renamed = NULL; nvpair_t *pair = NULL; (void) strlcpy(tofs, destname, sizeof (tofs)); if (flags->isprefix) { struct drr_begin *drrb = &drr->drr_u.drr_begin; int i; if 
(flags->istail) { cp = strrchr(drrb->drr_toname, '/'); if (cp == NULL) { (void) strlcat(tofs, "/", sizeof (tofs)); i = 0; } else { i = (cp - drrb->drr_toname); } } else { i = strcspn(drrb->drr_toname, "/@"); } /* zfs_receive_one() will create_parents() */ (void) strlcat(tofs, &drrb->drr_toname[i], sizeof (tofs)); *strchr(tofs, '@') = '\0'; } if (!flags->dryrun && !flags->nomount) { renamed = fnvlist_alloc(); } softerr = recv_incremental_replication(hdl, tofs, flags, stream_nv, stream_avl, renamed); /* Unmount renamed filesystems before receiving. */ while ((pair = nvlist_next_nvpair(renamed, pair)) != NULL) { zfs_handle_t *zhp; prop_changelist_t *clp = NULL; zhp = zfs_open(hdl, nvpair_name(pair), ZFS_TYPE_FILESYSTEM); if (zhp != NULL) { clp = changelist_gather(zhp, ZFS_PROP_MOUNTPOINT, 0, flags->forceunmount ? MS_FORCE : 0); zfs_close(zhp); if (clp != NULL) { softerr |= changelist_prefix(clp); changelist_free(clp); } } } fnvlist_free(renamed); } } /* * Get the fs specified by the first path in the stream (the top level * specified by 'zfs send') and pass it to each invocation of * zfs_receive_one(). */ (void) strlcpy(sendfs, drr->drr_u.drr_begin.drr_toname, sizeof (sendfs)); if ((cp = strchr(sendfs, '@')) != NULL) { *cp = '\0'; /* * Find the "sendsnap", the final snapshot in a replication * stream. zfs_receive_one() handles certain errors * differently, depending on if the contained stream is the * last one or not. */ sendsnap = (cp + 1); } /* Finally, receive each contained stream */ do { /* * we should figure out if it has a recoverable * error, in which case do a recv_skip() and drive on. * Note, if we fail due to already having this guid, * zfs_receive_one() will take care of it (ie, * recv_skip() and return 0). 
*/ error = zfs_receive_impl(hdl, destname, NULL, flags, fd, sendfs, stream_nv, stream_avl, top_zfs, sendsnap, cmdprops); if (error == ENODATA) { error = 0; break; } anyerr |= error; } while (error == 0); if (drr->drr_payloadlen != 0 && recursive && fromsnap != NULL) { /* * Now that we have the fs's they sent us, try the * renames again. */ softerr = recv_incremental_replication(hdl, tofs, flags, stream_nv, stream_avl, NULL); } if (raw && softerr == 0 && *top_zfs != NULL) { softerr = recv_fix_encryption_hierarchy(hdl, *top_zfs, stream_nv); } out: fsavl_destroy(stream_avl); fnvlist_free(stream_nv); if (softerr) error = -2; if (anyerr) error = -1; return (error); } static void trunc_prop_errs(int truncated) { ASSERT(truncated != 0); if (truncated == 1) (void) fprintf(stderr, dgettext(TEXT_DOMAIN, "1 more property could not be set\n")); else (void) fprintf(stderr, dgettext(TEXT_DOMAIN, "%d more properties could not be set\n"), truncated); } static int recv_skip(libzfs_handle_t *hdl, int fd, boolean_t byteswap) { dmu_replay_record_t *drr; void *buf = zfs_alloc(hdl, SPA_MAXBLOCKSIZE); uint64_t payload_size; char errbuf[ERRBUFLEN]; (void) snprintf(errbuf, sizeof (errbuf), dgettext(TEXT_DOMAIN, "cannot receive")); /* XXX would be great to use lseek if possible... */ drr = buf; while (recv_read(hdl, fd, drr, sizeof (dmu_replay_record_t), byteswap, NULL) == 0) { if (byteswap) drr->drr_type = BSWAP_32(drr->drr_type); switch (drr->drr_type) { case DRR_BEGIN: if (drr->drr_payloadlen != 0) { (void) recv_read(hdl, fd, buf, drr->drr_payloadlen, B_FALSE, NULL); } break; case DRR_END: free(buf); return (0); case DRR_OBJECT: if (byteswap) { drr->drr_u.drr_object.drr_bonuslen = BSWAP_32(drr->drr_u.drr_object. drr_bonuslen); drr->drr_u.drr_object.drr_raw_bonuslen = BSWAP_32(drr->drr_u.drr_object. 
drr_raw_bonuslen); } payload_size = DRR_OBJECT_PAYLOAD_SIZE(&drr->drr_u.drr_object); (void) recv_read(hdl, fd, buf, payload_size, B_FALSE, NULL); break; case DRR_WRITE: if (byteswap) { drr->drr_u.drr_write.drr_logical_size = BSWAP_64( drr->drr_u.drr_write.drr_logical_size); drr->drr_u.drr_write.drr_compressed_size = BSWAP_64( drr->drr_u.drr_write.drr_compressed_size); } payload_size = DRR_WRITE_PAYLOAD_SIZE(&drr->drr_u.drr_write); assert(payload_size <= SPA_MAXBLOCKSIZE); (void) recv_read(hdl, fd, buf, payload_size, B_FALSE, NULL); break; case DRR_SPILL: if (byteswap) { drr->drr_u.drr_spill.drr_length = BSWAP_64(drr->drr_u.drr_spill.drr_length); drr->drr_u.drr_spill.drr_compressed_size = BSWAP_64(drr->drr_u.drr_spill. drr_compressed_size); } payload_size = DRR_SPILL_PAYLOAD_SIZE(&drr->drr_u.drr_spill); (void) recv_read(hdl, fd, buf, payload_size, B_FALSE, NULL); break; case DRR_WRITE_EMBEDDED: if (byteswap) { drr->drr_u.drr_write_embedded.drr_psize = BSWAP_32(drr->drr_u.drr_write_embedded. drr_psize); } (void) recv_read(hdl, fd, buf, P2ROUNDUP(drr->drr_u.drr_write_embedded.drr_psize, 8), B_FALSE, NULL); break; case DRR_OBJECT_RANGE: case DRR_WRITE_BYREF: case DRR_FREEOBJECTS: case DRR_FREE: break; default: zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "invalid record type")); free(buf); return (zfs_error(hdl, EZFS_BADSTREAM, errbuf)); } } free(buf); return (-1); } static void recv_ecksum_set_aux(libzfs_handle_t *hdl, const char *target_snap, boolean_t resumable, boolean_t checksum) { char target_fs[ZFS_MAX_DATASET_NAME_LEN]; zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, (checksum ? 
"checksum mismatch" : "incomplete stream")));

	if (!resumable)
		return;
	(void) strlcpy(target_fs, target_snap, sizeof (target_fs));
	*strchr(target_fs, '@') = '\0';
	zfs_handle_t *zhp = zfs_open(hdl, target_fs,
	    ZFS_TYPE_FILESYSTEM | ZFS_TYPE_VOLUME);
	if (zhp == NULL)
		return;

	char token_buf[ZFS_MAXPROPLEN];
	int error = zfs_prop_get(zhp, ZFS_PROP_RECEIVE_RESUME_TOKEN,
	    token_buf, sizeof (token_buf),
	    NULL, NULL, 0, B_TRUE);
	if (error == 0) {
		zfs_error_aux(hdl, dgettext(TEXT_DOMAIN,
		    "checksum mismatch or incomplete stream.\n"
		    "Partially received snapshot is saved.\n"
		    "A resuming stream can be generated on the sending "
		    "system by running:\n"
		    " zfs send -t %s"),
		    token_buf);
	}
	zfs_close(zhp);
}

/*
 * Prepare a new nvlist of properties that are to override (-o) or be excluded
 * (-x) from the received dataset.
 *
 * recvprops: received properties from the send stream; an excluded property
 *            that is neither a user property nor inheritable is removed
 *            from this list in place
 * cmdprops:  raw input properties from command line (boolean pairs are
 *            handled as -x exclusions, string pairs as -o overrides)
 * origprops: properties, both locally-set and received, currently set on the
 *            target dataset if it exists, NULL otherwise.
 * oxprops:   valid output override (-o) and excluded (-x) properties.
 *            Allocated here unless cmdprops is empty; it is never freed by
 *            this function, not even on the error path, so the caller owns
 *            it either way.
 * wkeydata_out, wkeylen_out: passed through to zfs_crypto_create() when the
 *            top-level dataset is processed
 *
 * Returns 0 on success (including the "cmdprops is empty" early return), -1
 * if the dataset or pool handle cannot be opened, or the value returned by
 * zfs_error() when a property is rejected.
 */
static int
zfs_setup_cmdline_props(libzfs_handle_t *hdl, zfs_type_t type,
    char *fsname, boolean_t zoned, boolean_t recursive, boolean_t newfs,
    boolean_t raw, boolean_t toplevel, nvlist_t *recvprops, nvlist_t *cmdprops,
    nvlist_t *origprops, nvlist_t **oxprops, uint8_t **wkeydata_out,
    uint_t *wkeylen_out, const char *errbuf)
{
	nvpair_t *nvp;
	nvlist_t *oprops, *voprops;
	zfs_handle_t *zhp = NULL;
	zpool_handle_t *zpool_hdl = NULL;
	char *cp;
	int ret = 0;
	char namebuf[ZFS_MAX_DATASET_NAME_LEN];

	if (nvlist_empty(cmdprops))
		return (0); /* No properties to override or exclude */

	*oxprops = fnvlist_alloc();
	/* oprops gathers the raw -o values; it is freed at "error" below */
	oprops = fnvlist_alloc();

	/* work on a private copy: namebuf is truncated in place below */
	strlcpy(namebuf, fsname, ZFS_MAX_DATASET_NAME_LEN);

	/*
	 * Get our dataset handle. The target dataset may not exist yet.
	 */
	if (zfs_dataset_exists(hdl, namebuf, ZFS_TYPE_DATASET)) {
		zhp = zfs_open(hdl, namebuf, ZFS_TYPE_DATASET);
		if (zhp == NULL) {
			ret = -1;
			goto error;
		}
	}

	/* open the zpool handle */
	cp = strchr(namebuf, '/');
	if (cp != NULL)
		*cp = '\0';
	zpool_hdl = zpool_open(hdl, namebuf);
	if (zpool_hdl == NULL) {
		ret = -1;
		goto error;
	}

	/* restore namebuf to match fsname for later use */
	if (cp != NULL)
		*cp = '/';

	/*
	 * first iteration: process excluded (-x) properties now and gather
	 * added (-o) properties to be later processed by zfs_valid_proplist()
	 */
	nvp = NULL;
	while ((nvp = nvlist_next_nvpair(cmdprops, nvp)) != NULL) {
		const char *name = nvpair_name(nvp);
		zfs_prop_t prop = zfs_name_to_prop(name);

		/*
		 * It turns out, if we don't normalize "aliased" names
		 * e.g. compress= against the "real" names (e.g. compression)
		 * here, then setting/excluding them does not work as
		 * intended.
		 *
		 * But since user-defined properties wouldn't have a valid
		 * mapping here, we do this conditional dance.
		 */
		const char *newname = name;
		if (prop >= ZFS_PROP_TYPE)
			newname = zfs_prop_to_name(prop);

		/* "origin" is processed separately, don't handle it here */
		if (prop == ZFS_PROP_ORIGIN)
			continue;

		/* raw streams can't override encryption properties */
		if ((zfs_prop_encryption_key_param(prop) ||
		    prop == ZFS_PROP_ENCRYPTION) && raw) {
			zfs_error_aux(hdl, dgettext(TEXT_DOMAIN,
			    "encryption property '%s' cannot "
			    "be set or excluded for raw streams."), name);
			ret = zfs_error(hdl, EZFS_BADPROP, errbuf);
			goto error;
		}

		/*
		 * For plain replicated send, we can ignore encryption
		 * properties other than first stream
		 */
		if ((zfs_prop_encryption_key_param(prop) || prop ==
		    ZFS_PROP_ENCRYPTION) && !newfs && recursive && !raw) {
			continue;
		}

		/* incremental streams can only exclude encryption properties */
		if ((zfs_prop_encryption_key_param(prop) ||
		    prop == ZFS_PROP_ENCRYPTION) && !newfs &&
		    nvpair_type(nvp) != DATA_TYPE_BOOLEAN) {
			zfs_error_aux(hdl, dgettext(TEXT_DOMAIN,
			    "encryption property '%s' cannot "
			    "be set for incremental streams."), name);
			ret = zfs_error(hdl, EZFS_BADPROP, errbuf);
			goto error;
		}

		/*
		 * Note: a "continue" inside this switch advances the
		 * enclosing while loop to the next command-line property.
		 */
		switch (nvpair_type(nvp)) {
		case DATA_TYPE_BOOLEAN: /* -x property */
			/*
			 * DATA_TYPE_BOOLEAN is the way we're asked to "exclude"
			 * a property: this is done by forcing an explicit
			 * inherit on the destination so the effective value is
			 * not the one we received from the send stream.
			 */
			if (!zfs_prop_valid_for_type(prop, type, B_FALSE) &&
			    !zfs_prop_user(name)) {
				(void) fprintf(stderr, dgettext(TEXT_DOMAIN,
				    "Warning: %s: property '%s' does not "
				    "apply to datasets of this type\n"),
				    fsname, name);
				continue;
			}
			/*
			 * We do this only if the property is not already
			 * locally-set, in which case its value will take
			 * priority over the received anyway.
			 */
			if (nvlist_exists(origprops, newname)) {
				nvlist_t *attrs;
				char *source = NULL;

				attrs = fnvlist_lookup_nvlist(origprops,
				    newname);
				if (nvlist_lookup_string(attrs,
				    ZPROP_SOURCE, &source) == 0 &&
				    strcmp(source, ZPROP_SOURCE_VAL_RECVD) != 0)
					continue;
			}
			/*
			 * We can't force an explicit inherit on non-inheritable
			 * properties: if we're asked to exclude this kind of
			 * values we remove them from "recvprops" input nvlist.
			 */
			if (!zfs_prop_user(name) && /* can be inherited too */
			    !zfs_prop_inheritable(prop) &&
			    nvlist_exists(recvprops, newname))
				fnvlist_remove(recvprops, newname);
			else
				fnvlist_add_boolean(*oxprops, newname);
			break;
		case DATA_TYPE_STRING: /* -o property=value */
			/*
			 * we're trying to override a property that does not
			 * make sense for this type of dataset, but we don't
			 * want to fail if the receive is recursive: this comes
			 * in handy when the send stream contains, for
			 * instance, a child ZVOL and we're trying to receive
			 * it with "-o atime=on"
			 */
			if (!zfs_prop_valid_for_type(prop, type, B_FALSE) &&
			    !zfs_prop_user(name)) {
				if (recursive)
					continue;
				zfs_error_aux(hdl, dgettext(TEXT_DOMAIN,
				    "property '%s' does not apply to datasets "
				    "of this type"), name);
				ret = zfs_error(hdl, EZFS_BADPROP, errbuf);
				goto error;
			}
			fnvlist_add_string(oprops, newname,
			    fnvpair_value_string(nvp));
			break;
		default:
			zfs_error_aux(hdl, dgettext(TEXT_DOMAIN,
			    "property '%s' must be a string or boolean"), name);
			ret = zfs_error(hdl, EZFS_BADPROP, errbuf);
			goto error;
		}
	}

	if (toplevel) {
		/* convert override strings properties to native */
		if ((voprops = zfs_valid_proplist(hdl, ZFS_TYPE_DATASET,
		    oprops, zoned, zhp, zpool_hdl, B_FALSE, errbuf)) == NULL) {
			ret = zfs_error(hdl, EZFS_BADPROP, errbuf);
			goto error;
		}

		/*
		 * zfs_crypto_create() requires the parent name. Get it
		 * by truncating the fsname copy stored in namebuf.
		 */
		cp = strrchr(namebuf, '/');
		if (cp != NULL)
			*cp = '\0';

		/*
		 * The key setup is skipped for raw streams and for a
		 * recursive receive that is not creating a new filesystem.
		 */
		if (!raw && !(!newfs && recursive) &&
		    zfs_crypto_create(hdl, namebuf, voprops, NULL,
		    B_FALSE, wkeydata_out, wkeylen_out) != 0) {
			fnvlist_free(voprops);
			ret = zfs_error(hdl, EZFS_CRYPTOFAILED, errbuf);
			goto error;
		}

		/* second pass: process "-o" properties */
		fnvlist_merge(*oxprops, voprops);
		fnvlist_free(voprops);
	} else {
		/* override props on child dataset are inherited */
		nvp = NULL;
		while ((nvp = nvlist_next_nvpair(oprops, nvp)) != NULL) {
			const char *name = nvpair_name(nvp);
			fnvlist_add_boolean(*oxprops, name);
		}
	}

error:
	/* shared exit path: also reached on success, with ret still 0 */
	if (zhp != NULL)
		zfs_close(zhp);
	if (zpool_hdl != NULL)
		zpool_close(zpool_hdl);
	fnvlist_free(oprops);
	return (ret);
}

/*
 * Restores a backup of tosnap from the file descriptor specified by infd.
 */
static int
zfs_receive_one(libzfs_handle_t *hdl, int infd, const char *tosnap,
    const char *originsnap, recvflags_t *flags, dmu_replay_record_t *drr,
    dmu_replay_record_t *drr_noswap, const char *sendfs, nvlist_t *stream_nv,
    avl_tree_t *stream_avl, char **top_zfs,
    const char *finalsnap, nvlist_t *cmdprops)
{
	struct timespec begin_time;
	int ioctl_err, ioctl_errno, err;
	char *cp;
	struct drr_begin *drrb = &drr->drr_u.drr_begin;
	char errbuf[ERRBUFLEN];
	const char *chopprefix;
	boolean_t newfs = B_FALSE;
	boolean_t stream_wantsnewfs, stream_resumingnewfs;
	boolean_t newprops = B_FALSE;
	uint64_t read_bytes = 0;
	uint64_t errflags = 0;
	uint64_t parent_snapguid = 0;
	prop_changelist_t *clp = NULL;
	nvlist_t *snapprops_nvlist = NULL;
	nvlist_t *snapholds_nvlist = NULL;
	zprop_errflags_t prop_errflags;
	nvlist_t *prop_errors = NULL;
	boolean_t recursive;
	char *snapname = NULL;
	char destsnap[MAXPATHLEN * 2];
	char origin[MAXNAMELEN] = {0};
	char name[MAXPATHLEN];
	char tmp_keylocation[MAXNAMELEN] = {0};
	nvlist_t *rcvprops = NULL; /* props received from the send stream */
	nvlist_t *oxprops = NULL; /* override (-o) and exclude (-x) props */
	nvlist_t *origprops = NULL; /* original props (if destination exists) */
zfs_type_t type = ZFS_TYPE_INVALID; boolean_t toplevel = B_FALSE; boolean_t zoned = B_FALSE; boolean_t hastoken = B_FALSE; boolean_t redacted; uint8_t *wkeydata = NULL; uint_t wkeylen = 0; #ifndef CLOCK_MONOTONIC_RAW #define CLOCK_MONOTONIC_RAW CLOCK_MONOTONIC #endif clock_gettime(CLOCK_MONOTONIC_RAW, &begin_time); (void) snprintf(errbuf, sizeof (errbuf), dgettext(TEXT_DOMAIN, "cannot receive")); recursive = (nvlist_lookup_boolean(stream_nv, "not_recursive") == ENOENT); /* Did the user request holds be skipped via zfs recv -k? */ boolean_t holds = flags->holds && !flags->skipholds; if (stream_avl != NULL) { char *keylocation = NULL; nvlist_t *lookup = NULL; nvlist_t *fs = fsavl_find(stream_avl, drrb->drr_toguid, &snapname); (void) nvlist_lookup_uint64(fs, "parentfromsnap", &parent_snapguid); err = nvlist_lookup_nvlist(fs, "props", &rcvprops); if (err) { rcvprops = fnvlist_alloc(); newprops = B_TRUE; } /* * The keylocation property may only be set on encryption roots, * but this dataset might not become an encryption root until * recv_fix_encryption_hierarchy() is called. That function * will fixup the keylocation anyway, so we temporarily unset * the keylocation for now to avoid any errors from the receive * ioctl. 
*/ err = nvlist_lookup_string(rcvprops, zfs_prop_to_name(ZFS_PROP_KEYLOCATION), &keylocation); if (err == 0) { strlcpy(tmp_keylocation, keylocation, MAXNAMELEN); (void) nvlist_remove_all(rcvprops, zfs_prop_to_name(ZFS_PROP_KEYLOCATION)); } if (flags->canmountoff) { fnvlist_add_uint64(rcvprops, zfs_prop_to_name(ZFS_PROP_CANMOUNT), 0); } else if (newprops) { /* nothing in rcvprops, eliminate it */ fnvlist_free(rcvprops); rcvprops = NULL; newprops = B_FALSE; } if (0 == nvlist_lookup_nvlist(fs, "snapprops", &lookup)) { snapprops_nvlist = fnvlist_lookup_nvlist(lookup, snapname); } if (holds) { if (0 == nvlist_lookup_nvlist(fs, "snapholds", &lookup)) { snapholds_nvlist = fnvlist_lookup_nvlist( lookup, snapname); } } } cp = NULL; /* * Determine how much of the snapshot name stored in the stream * we are going to tack on to the name they specified on the * command line, and how much we are going to chop off. * * If they specified a snapshot, chop the entire name stored in * the stream. */ if (flags->istail) { /* * A filesystem was specified with -e. We want to tack on only * the tail of the sent snapshot path. */ if (strchr(tosnap, '@')) { zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "invalid " "argument - snapshot not allowed with -e")); err = zfs_error(hdl, EZFS_INVALIDNAME, errbuf); goto out; } chopprefix = strrchr(sendfs, '/'); if (chopprefix == NULL) { /* * The tail is the poolname, so we need to * prepend a path separator. */ int len = strlen(drrb->drr_toname); cp = umem_alloc(len + 2, UMEM_NOFAIL); cp[0] = '/'; (void) strcpy(&cp[1], drrb->drr_toname); chopprefix = cp; } else { chopprefix = drrb->drr_toname + (chopprefix - sendfs); } } else if (flags->isprefix) { /* * A filesystem was specified with -d. We want to tack on * everything but the first element of the sent snapshot path * (all but the pool name). 
*/ if (strchr(tosnap, '@')) { zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "invalid " "argument - snapshot not allowed with -d")); err = zfs_error(hdl, EZFS_INVALIDNAME, errbuf); goto out; } chopprefix = strchr(drrb->drr_toname, '/'); if (chopprefix == NULL) chopprefix = strchr(drrb->drr_toname, '@'); } else if (strchr(tosnap, '@') == NULL) { /* * If a filesystem was specified without -d or -e, we want to * tack on everything after the fs specified by 'zfs send'. */ chopprefix = drrb->drr_toname + strlen(sendfs); } else { /* A snapshot was specified as an exact path (no -d or -e). */ if (recursive) { zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "cannot specify snapshot name for multi-snapshot " "stream")); err = zfs_error(hdl, EZFS_BADSTREAM, errbuf); goto out; } chopprefix = drrb->drr_toname + strlen(drrb->drr_toname); } ASSERT(strstr(drrb->drr_toname, sendfs) == drrb->drr_toname); ASSERT(chopprefix > drrb->drr_toname || strchr(sendfs, '/') == NULL); ASSERT(chopprefix <= drrb->drr_toname + strlen(drrb->drr_toname) || strchr(sendfs, '/') == NULL); ASSERT(chopprefix[0] == '/' || chopprefix[0] == '@' || chopprefix[0] == '\0'); /* * Determine name of destination snapshot. */ (void) strlcpy(destsnap, tosnap, sizeof (destsnap)); (void) strlcat(destsnap, chopprefix, sizeof (destsnap)); if (cp != NULL) umem_free(cp, strlen(cp) + 1); if (!zfs_name_valid(destsnap, ZFS_TYPE_SNAPSHOT)) { err = zfs_error(hdl, EZFS_INVALIDNAME, errbuf); goto out; } /* * Determine the name of the origin snapshot. 
*/ if (originsnap) { (void) strlcpy(origin, originsnap, sizeof (origin)); if (flags->verbose) (void) printf("using provided clone origin %s\n", origin); } else if (drrb->drr_flags & DRR_FLAG_CLONE) { if (guid_to_name(hdl, destsnap, drrb->drr_fromguid, B_FALSE, origin) != 0) { zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "local origin for clone %s does not exist"), destsnap); err = zfs_error(hdl, EZFS_NOENT, errbuf); goto out; } if (flags->verbose) (void) printf("found clone origin %s\n", origin); } if ((DMU_GET_FEATUREFLAGS(drrb->drr_versioninfo) & DMU_BACKUP_FEATURE_DEDUP)) { (void) fprintf(stderr, gettext("ERROR: \"zfs receive\" no longer supports " "deduplicated send streams. Use\n" "the \"zstream redup\" command to convert this stream " "to a regular,\n" "non-deduplicated stream.\n")); err = zfs_error(hdl, EZFS_NOTSUP, errbuf); goto out; } boolean_t resuming = DMU_GET_FEATUREFLAGS(drrb->drr_versioninfo) & DMU_BACKUP_FEATURE_RESUMING; boolean_t raw = DMU_GET_FEATUREFLAGS(drrb->drr_versioninfo) & DMU_BACKUP_FEATURE_RAW; boolean_t embedded = DMU_GET_FEATUREFLAGS(drrb->drr_versioninfo) & DMU_BACKUP_FEATURE_EMBED_DATA; stream_wantsnewfs = (drrb->drr_fromguid == 0 || (drrb->drr_flags & DRR_FLAG_CLONE) || originsnap) && !resuming; stream_resumingnewfs = (drrb->drr_fromguid == 0 || (drrb->drr_flags & DRR_FLAG_CLONE) || originsnap) && resuming; if (stream_wantsnewfs) { /* * if the parent fs does not exist, look for it based on * the parent snap GUID */ (void) snprintf(errbuf, sizeof (errbuf), dgettext(TEXT_DOMAIN, "cannot receive new filesystem stream")); (void) strlcpy(name, destsnap, sizeof (name)); cp = strrchr(name, '/'); if (cp) *cp = '\0'; if (cp && !zfs_dataset_exists(hdl, name, ZFS_TYPE_DATASET)) { char suffix[ZFS_MAX_DATASET_NAME_LEN]; (void) strlcpy(suffix, strrchr(destsnap, '/'), sizeof (suffix)); if (guid_to_name(hdl, name, parent_snapguid, B_FALSE, destsnap) == 0) { *strchr(destsnap, '@') = '\0'; (void) strlcat(destsnap, suffix, sizeof (destsnap) - 
strlen(destsnap)); } } } else { /* * If the fs does not exist, look for it based on the * fromsnap GUID. */ if (resuming) { (void) snprintf(errbuf, sizeof (errbuf), dgettext(TEXT_DOMAIN, "cannot receive resume stream")); } else { (void) snprintf(errbuf, sizeof (errbuf), dgettext(TEXT_DOMAIN, "cannot receive incremental stream")); } (void) strlcpy(name, destsnap, sizeof (name)); *strchr(name, '@') = '\0'; /* * If the exact receive path was specified and this is the * topmost path in the stream, then if the fs does not exist we * should look no further. */ if ((flags->isprefix || (*(chopprefix = drrb->drr_toname + strlen(sendfs)) != '\0' && *chopprefix != '@')) && !zfs_dataset_exists(hdl, name, ZFS_TYPE_DATASET)) { char snap[ZFS_MAX_DATASET_NAME_LEN]; (void) strlcpy(snap, strchr(destsnap, '@'), sizeof (snap)); if (guid_to_name(hdl, name, drrb->drr_fromguid, B_FALSE, destsnap) == 0) { *strchr(destsnap, '@') = '\0'; (void) strlcat(destsnap, snap, sizeof (destsnap) - strlen(destsnap)); } } } (void) strlcpy(name, destsnap, sizeof (name)); *strchr(name, '@') = '\0'; redacted = DMU_GET_FEATUREFLAGS(drrb->drr_versioninfo) & DMU_BACKUP_FEATURE_REDACTED; if (flags->heal) { if (flags->isprefix || flags->istail || flags->force || flags->canmountoff || flags->resumable || flags->nomount || flags->skipholds) { zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "corrective recv can not be used when combined with" " this flag")); err = zfs_error(hdl, EZFS_INVALIDNAME, errbuf); goto out; } uint64_t guid = get_snap_guid(hdl, name, strchr(destsnap, '@') + 1); if (guid == 0) { zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "corrective recv must specify an existing snapshot" " to heal")); err = zfs_error(hdl, EZFS_INVALIDNAME, errbuf); goto out; } else if (guid != drrb->drr_toguid) { zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "local snapshot doesn't match the snapshot" " in the provided stream")); err = zfs_error(hdl, EZFS_WRONG_PARENT, errbuf); goto out; } } else if (zfs_dataset_exists(hdl, name, 
ZFS_TYPE_DATASET)) { zfs_cmd_t zc = {"\0"}; zfs_handle_t *zhp = NULL; boolean_t encrypted; (void) strcpy(zc.zc_name, name); /* * Destination fs exists. It must be one of these cases: * - an incremental send stream * - the stream specifies a new fs (full stream or clone) * and they want us to blow away the existing fs (and * have therefore specified -F and removed any snapshots) * - we are resuming a failed receive. */ if (stream_wantsnewfs) { boolean_t is_volume = drrb->drr_type == DMU_OST_ZVOL; if (!flags->force) { zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "destination '%s' exists\n" "must specify -F to overwrite it"), name); err = zfs_error(hdl, EZFS_EXISTS, errbuf); goto out; } if (zfs_ioctl(hdl, ZFS_IOC_SNAPSHOT_LIST_NEXT, &zc) == 0) { zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "destination has snapshots (eg. %s)\n" "must destroy them to overwrite it"), zc.zc_name); err = zfs_error(hdl, EZFS_EXISTS, errbuf); goto out; } if (is_volume && strrchr(name, '/') == NULL) { zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "destination %s is the root dataset\n" "cannot overwrite with a ZVOL"), name); err = zfs_error(hdl, EZFS_EXISTS, errbuf); goto out; } if (is_volume && zfs_ioctl(hdl, ZFS_IOC_DATASET_LIST_NEXT, &zc) == 0) { zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "destination has children (eg. %s)\n" "cannot overwrite with a ZVOL"), zc.zc_name); err = zfs_error(hdl, EZFS_WRONG_PARENT, errbuf); goto out; } } if ((zhp = zfs_open(hdl, name, ZFS_TYPE_FILESYSTEM | ZFS_TYPE_VOLUME)) == NULL) { err = -1; goto out; } /* * When receiving full/newfs on existing dataset, then it * should be done with "-F" flag. Its enforced for initial * receive in previous checks in this function. * Similarly, on resuming full/newfs recv on existing dataset, * it should be done with "-F" flag. * * When dataset doesn't exist, then full/newfs recv is done on * newly created dataset and it's marked INCONSISTENT. 
But * When receiving on existing dataset, recv is first done on * %recv and its marked INCONSISTENT. Existing dataset is not * marked INCONSISTENT. * Resume of full/newfs receive with dataset not INCONSISTENT * indicates that its resuming newfs on existing dataset. So, * enforce "-F" flag in this case. */ if (stream_resumingnewfs && !zfs_prop_get_int(zhp, ZFS_PROP_INCONSISTENT) && !flags->force) { zfs_close(zhp); zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "Resuming recv on existing destination '%s'\n" "must specify -F to overwrite it"), name); err = zfs_error(hdl, EZFS_RESUME_EXISTS, errbuf); goto out; } if (stream_wantsnewfs && zhp->zfs_dmustats.dds_origin[0]) { zfs_close(zhp); zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "destination '%s' is a clone\n" "must destroy it to overwrite it"), name); err = zfs_error(hdl, EZFS_EXISTS, errbuf); goto out; } /* * Raw sends can not be performed as an incremental on top * of existing unencrypted datasets. zfs recv -F can't be * used to blow away an existing encrypted filesystem. This * is because it would require the dsl dir to point to the * new key (or lack of a key) and the old key at the same * time. The -F flag may still be used for deleting * intermediate snapshots that would otherwise prevent the * receive from working. 
*/ encrypted = zfs_prop_get_int(zhp, ZFS_PROP_ENCRYPTION) != ZIO_CRYPT_OFF; if (!stream_wantsnewfs && !encrypted && raw) { zfs_close(zhp); zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "cannot perform raw receive on top of " "existing unencrypted dataset")); err = zfs_error(hdl, EZFS_BADRESTORE, errbuf); goto out; } if (stream_wantsnewfs && flags->force && ((raw && !encrypted) || encrypted)) { zfs_close(zhp); zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "zfs receive -F cannot be used to destroy an " "encrypted filesystem or overwrite an " "unencrypted one with an encrypted one")); err = zfs_error(hdl, EZFS_BADRESTORE, errbuf); goto out; } if (!flags->dryrun && zhp->zfs_type == ZFS_TYPE_FILESYSTEM && (stream_wantsnewfs || stream_resumingnewfs)) { /* We can't do online recv in this case */ clp = changelist_gather(zhp, ZFS_PROP_NAME, 0, flags->forceunmount ? MS_FORCE : 0); if (clp == NULL) { zfs_close(zhp); err = -1; goto out; } if (changelist_prefix(clp) != 0) { changelist_free(clp); zfs_close(zhp); err = -1; goto out; } } /* * If we are resuming a newfs, set newfs here so that we will * mount it if the recv succeeds this time. We can tell * that it was a newfs on the first recv because the fs * itself will be inconsistent (if the fs existed when we * did the first recv, we would have received it into * .../%recv). */ if (resuming && zfs_prop_get_int(zhp, ZFS_PROP_INCONSISTENT)) newfs = B_TRUE; /* we want to know if we're zoned when validating -o|-x props */ zoned = zfs_prop_get_int(zhp, ZFS_PROP_ZONED); /* may need this info later, get it now we have zhp around */ if (zfs_prop_get(zhp, ZFS_PROP_RECEIVE_RESUME_TOKEN, NULL, 0, NULL, NULL, 0, B_TRUE) == 0) hastoken = B_TRUE; /* gather existing properties on destination */ origprops = fnvlist_alloc(); fnvlist_merge(origprops, zhp->zfs_props); fnvlist_merge(origprops, zhp->zfs_user_props); zfs_close(zhp); } else { zfs_handle_t *zhp; /* * Destination filesystem does not exist. 
Therefore we better * be creating a new filesystem (either from a full backup, or * a clone). It would therefore be invalid if the user * specified only the pool name (i.e. if the destination name * contained no slash character). */ cp = strrchr(name, '/'); if (!stream_wantsnewfs || cp == NULL) { zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "destination '%s' does not exist"), name); err = zfs_error(hdl, EZFS_NOENT, errbuf); goto out; } /* * Trim off the final dataset component so we perform the * recvbackup ioctl to the filesystems's parent. */ *cp = '\0'; if (flags->isprefix && !flags->istail && !flags->dryrun && create_parents(hdl, destsnap, strlen(tosnap)) != 0) { err = zfs_error(hdl, EZFS_BADRESTORE, errbuf); goto out; } /* validate parent */ zhp = zfs_open(hdl, name, ZFS_TYPE_DATASET); if (zhp == NULL) { err = zfs_error(hdl, EZFS_BADRESTORE, errbuf); goto out; } if (zfs_get_type(zhp) != ZFS_TYPE_FILESYSTEM) { zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "parent '%s' is not a filesystem"), name); err = zfs_error(hdl, EZFS_WRONG_PARENT, errbuf); zfs_close(zhp); goto out; } zfs_close(zhp); newfs = B_TRUE; *cp = '/'; } if (flags->verbose) { (void) printf("%s %s%s stream of %s into %s\n", flags->dryrun ? "would receive" : "receiving", flags->heal ? " corrective" : "", drrb->drr_fromguid ? "incremental" : "full", drrb->drr_toname, destsnap); (void) fflush(stdout); } /* * If this is the top-level dataset, record it so we can use it * for recursive operations later. 
*/ if (top_zfs != NULL && (*top_zfs == NULL || strcmp(*top_zfs, name) == 0)) { toplevel = B_TRUE; if (*top_zfs == NULL) *top_zfs = zfs_strdup(hdl, name); } if (drrb->drr_type == DMU_OST_ZVOL) { type = ZFS_TYPE_VOLUME; } else if (drrb->drr_type == DMU_OST_ZFS) { type = ZFS_TYPE_FILESYSTEM; } else { zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "invalid record type: 0x%d"), drrb->drr_type); err = zfs_error(hdl, EZFS_BADSTREAM, errbuf); goto out; } if ((err = zfs_setup_cmdline_props(hdl, type, name, zoned, recursive, stream_wantsnewfs, raw, toplevel, rcvprops, cmdprops, origprops, &oxprops, &wkeydata, &wkeylen, errbuf)) != 0) goto out; /* * When sending with properties (zfs send -p), the encryption property * is not included because it is a SETONCE property and therefore * treated as read only. However, we are always able to determine its * value because raw sends will include it in the DRR_BDEGIN payload * and non-raw sends with properties are not allowed for encrypted * datasets. Therefore, if this is a non-raw properties stream, we can * infer that the value should be ZIO_CRYPT_OFF and manually add that * to the received properties. */ if (stream_wantsnewfs && !raw && rcvprops != NULL && !nvlist_exists(cmdprops, zfs_prop_to_name(ZFS_PROP_ENCRYPTION))) { if (oxprops == NULL) oxprops = fnvlist_alloc(); fnvlist_add_uint64(oxprops, zfs_prop_to_name(ZFS_PROP_ENCRYPTION), ZIO_CRYPT_OFF); } if (flags->dryrun) { void *buf = zfs_alloc(hdl, SPA_MAXBLOCKSIZE); /* * We have read the DRR_BEGIN record, but we have * not yet read the payload. For non-dryrun sends * this will be done by the kernel, so we must * emulate that here, before attempting to read * more records. 
*/ err = recv_read(hdl, infd, buf, drr->drr_payloadlen, flags->byteswap, NULL); free(buf); if (err != 0) goto out; err = recv_skip(hdl, infd, flags->byteswap); goto out; } if (flags->heal) { err = ioctl_err = lzc_receive_with_heal(destsnap, rcvprops, oxprops, wkeydata, wkeylen, origin, flags->force, flags->heal, flags->resumable, raw, infd, drr_noswap, -1, &read_bytes, &errflags, NULL, &prop_errors); } else { err = ioctl_err = lzc_receive_with_cmdprops(destsnap, rcvprops, oxprops, wkeydata, wkeylen, origin, flags->force, flags->resumable, raw, infd, drr_noswap, -1, &read_bytes, &errflags, NULL, &prop_errors); } ioctl_errno = ioctl_err; prop_errflags = errflags; if (err == 0) { nvpair_t *prop_err = NULL; while ((prop_err = nvlist_next_nvpair(prop_errors, prop_err)) != NULL) { char tbuf[1024]; zfs_prop_t prop; int intval; prop = zfs_name_to_prop(nvpair_name(prop_err)); (void) nvpair_value_int32(prop_err, &intval); if (strcmp(nvpair_name(prop_err), ZPROP_N_MORE_ERRORS) == 0) { trunc_prop_errs(intval); break; } else if (snapname == NULL || finalsnap == NULL || strcmp(finalsnap, snapname) == 0 || strcmp(nvpair_name(prop_err), zfs_prop_to_name(ZFS_PROP_REFQUOTA)) != 0) { /* * Skip the special case of, for example, * "refquota", errors on intermediate * snapshots leading up to a final one. * That's why we have all of the checks above. * * See zfs_ioctl.c's extract_delay_props() for * a list of props which can fail on * intermediate snapshots, but shouldn't * affect the overall receive. 
*/ (void) snprintf(tbuf, sizeof (tbuf), dgettext(TEXT_DOMAIN, "cannot receive %s property on %s"), nvpair_name(prop_err), name); zfs_setprop_error(hdl, prop, intval, tbuf); } } } if (err == 0 && snapprops_nvlist) { zfs_cmd_t zc = {"\0"}; (void) strlcpy(zc.zc_name, destsnap, sizeof (zc.zc_name)); zc.zc_cookie = B_TRUE; /* received */ zcmd_write_src_nvlist(hdl, &zc, snapprops_nvlist); (void) zfs_ioctl(hdl, ZFS_IOC_SET_PROP, &zc); zcmd_free_nvlists(&zc); } if (err == 0 && snapholds_nvlist) { nvpair_t *pair; nvlist_t *holds, *errors = NULL; int cleanup_fd = -1; VERIFY(0 == nvlist_alloc(&holds, 0, KM_SLEEP)); for (pair = nvlist_next_nvpair(snapholds_nvlist, NULL); pair != NULL; pair = nvlist_next_nvpair(snapholds_nvlist, pair)) { fnvlist_add_string(holds, destsnap, nvpair_name(pair)); } (void) lzc_hold(holds, cleanup_fd, &errors); fnvlist_free(snapholds_nvlist); fnvlist_free(holds); } if (err && (ioctl_errno == ENOENT || ioctl_errno == EEXIST)) { /* * It may be that this snapshot already exists, * in which case we want to consume & ignore it * rather than failing. */ avl_tree_t *local_avl; nvlist_t *local_nv, *fs; cp = strchr(destsnap, '@'); /* * XXX Do this faster by just iterating over snaps in * this fs. Also if zc_value does not exist, we will * get a strange "does not exist" error message. 
*/ *cp = '\0'; if (gather_nvlist(hdl, destsnap, NULL, NULL, B_FALSE, B_TRUE, B_FALSE, B_FALSE, B_FALSE, B_FALSE, B_FALSE, B_FALSE, B_TRUE, &local_nv, &local_avl) == 0) { *cp = '@'; fs = fsavl_find(local_avl, drrb->drr_toguid, NULL); fsavl_destroy(local_avl); fnvlist_free(local_nv); if (fs != NULL) { if (flags->verbose) { (void) printf("snap %s already exists; " "ignoring\n", destsnap); } err = ioctl_err = recv_skip(hdl, infd, flags->byteswap); } } *cp = '@'; } if (ioctl_err != 0) { switch (ioctl_errno) { case ENODEV: cp = strchr(destsnap, '@'); *cp = '\0'; zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "most recent snapshot of %s does not\n" "match incremental source"), destsnap); (void) zfs_error(hdl, EZFS_BADRESTORE, errbuf); *cp = '@'; break; case ETXTBSY: zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "destination %s has been modified\n" "since most recent snapshot"), name); (void) zfs_error(hdl, EZFS_BADRESTORE, errbuf); break; case EACCES: if (flags->heal) { zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "key must be loaded to do a non-raw " "corrective recv on an encrypted " "dataset.")); } else if (raw && stream_wantsnewfs) { zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "failed to create encryption key")); } else if (raw && !stream_wantsnewfs) { zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "encryption key does not match " "existing key")); } else { zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "inherited key must be loaded")); } (void) zfs_error(hdl, EZFS_CRYPTOFAILED, errbuf); break; case EEXIST: cp = strchr(destsnap, '@'); if (newfs) { /* it's the containing fs that exists */ *cp = '\0'; } zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "destination already exists")); (void) zfs_error_fmt(hdl, EZFS_EXISTS, dgettext(TEXT_DOMAIN, "cannot restore to %s"), destsnap); *cp = '@'; break; case EINVAL: if (embedded && !raw) { zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "incompatible embedded data stream " "feature with encrypted receive.")); } else if (flags->resumable) { zfs_error_aux(hdl, 
dgettext(TEXT_DOMAIN, "kernel modules must be upgraded to " "receive this stream.")); } (void) zfs_error(hdl, EZFS_BADSTREAM, errbuf); break; case ECKSUM: case ZFS_ERR_STREAM_TRUNCATED: if (flags->heal) zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "corrective receive was not able to " "reconstruct the data needed for " "healing.")); else recv_ecksum_set_aux(hdl, destsnap, flags->resumable, ioctl_err == ECKSUM); (void) zfs_error(hdl, EZFS_BADSTREAM, errbuf); break; case ZFS_ERR_STREAM_LARGE_BLOCK_MISMATCH: zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "incremental send stream requires -L " "(--large-block), to match previous receive.")); (void) zfs_error(hdl, EZFS_BADSTREAM, errbuf); break; case ENOTSUP: if (flags->heal) zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "stream is not compatible with the " "data in the pool.")); else zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "pool must be upgraded to receive this " "stream.")); (void) zfs_error(hdl, EZFS_BADVERSION, errbuf); break; case EDQUOT: zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "destination %s space quota exceeded."), name); (void) zfs_error(hdl, EZFS_NOSPC, errbuf); break; case ZFS_ERR_FROM_IVSET_GUID_MISSING: zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "IV set guid missing. See errata %u at " "https://openzfs.github.io/openzfs-docs/msg/" "ZFS-8000-ER."), ZPOOL_ERRATA_ZOL_8308_ENCRYPTION); (void) zfs_error(hdl, EZFS_BADSTREAM, errbuf); break; case ZFS_ERR_FROM_IVSET_GUID_MISMATCH: zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "IV set guid mismatch. 
See the 'zfs receive' " "man page section\n discussing the limitations " "of raw encrypted send streams.")); (void) zfs_error(hdl, EZFS_BADSTREAM, errbuf); break; case ZFS_ERR_SPILL_BLOCK_FLAG_MISSING: zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "Spill block flag missing for raw send.\n" "The zfs software on the sending system must " "be updated.")); (void) zfs_error(hdl, EZFS_BADSTREAM, errbuf); break; case ZFS_ERR_RESUME_EXISTS: cp = strchr(destsnap, '@'); if (newfs) { /* it's the containing fs that exists */ *cp = '\0'; } zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "Resuming recv on existing dataset without force")); (void) zfs_error_fmt(hdl, EZFS_RESUME_EXISTS, dgettext(TEXT_DOMAIN, "cannot resume recv %s"), destsnap); *cp = '@'; break; + case E2BIG: + zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, + "zfs receive required kernel memory allocation " + "larger than the system can support. Please file " + "an issue at the OpenZFS issue tracker:\n" + "https://github.com/openzfs/zfs/issues/new")); + (void) zfs_error(hdl, EZFS_BADSTREAM, errbuf); + break; case EBUSY: if (hastoken) { zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "destination %s contains " "partially-complete state from " "\"zfs receive -s\"."), name); (void) zfs_error(hdl, EZFS_BUSY, errbuf); break; } zfs_fallthrough; default: (void) zfs_standard_error(hdl, ioctl_errno, errbuf); } } /* * Mount the target filesystem (if created). Also mount any * children of the target filesystem if we did a replication * receive (indicated by stream_avl being non-NULL). 
*/
	if (clp) {
		if (!flags->nomount)
			err |= changelist_postfix(clp);
		changelist_free(clp);
	}

	if ((newfs || stream_avl) && type == ZFS_TYPE_FILESYSTEM && !redacted)
		flags->domount = B_TRUE;

	if (prop_errflags & ZPROP_ERR_NOCLEAR) {
		(void) fprintf(stderr, dgettext(TEXT_DOMAIN, "Warning: "
		    "failed to clear unreceived properties on %s"), name);
		(void) fprintf(stderr, "\n");
	}
	if (prop_errflags & ZPROP_ERR_NORESTORE) {
		(void) fprintf(stderr, dgettext(TEXT_DOMAIN, "Warning: "
		    "failed to restore original properties on %s"), name);
		(void) fprintf(stderr, "\n");
	}

	if (err || ioctl_err) {
		err = -1;
		goto out;
	}

	if (flags->verbose) {
		char buf1[64];
		char buf2[64];
		uint64_t bytes = read_bytes;
		struct timespec delta;
		clock_gettime(CLOCK_MONOTONIC_RAW, &delta);
		if (begin_time.tv_nsec > delta.tv_nsec) {
			delta.tv_nsec =
			    1000000000 + delta.tv_nsec - begin_time.tv_nsec;
			delta.tv_sec -= 1;
		} else
			delta.tv_nsec -= begin_time.tv_nsec;
		delta.tv_sec -= begin_time.tv_sec;
		if (delta.tv_sec == 0 && delta.tv_nsec == 0)
			delta.tv_nsec = 1;
		double delta_f = delta.tv_sec + (delta.tv_nsec / 1e9);
		zfs_nicebytes(bytes, buf1, sizeof (buf1));
		zfs_nicebytes(bytes / delta_f, buf2, sizeof (buf2));

		(void) printf("received %s stream in %.2f seconds (%s/sec)\n",
		    buf1, delta_f, buf2);
	}

	err = 0;
out:
	if (prop_errors != NULL)
		fnvlist_free(prop_errors);

	if (tmp_keylocation[0] != '\0') {
		fnvlist_add_string(rcvprops,
		    zfs_prop_to_name(ZFS_PROP_KEYLOCATION), tmp_keylocation);
	}

	if (newprops)
		fnvlist_free(rcvprops);

	fnvlist_free(oxprops);
	fnvlist_free(origprops);

	return (err);
}

/*
 * Check properties we were asked to override (both -o|-x)
 *
 * Walks every pair in props and returns B_TRUE when all of them may be
 * received. At the first property that is rejected, an auxiliary error
 * message prefixed with errbuf is recorded on hdl and B_FALSE is returned;
 * the remaining pairs are not examined.
 */
static boolean_t
zfs_receive_checkprops(libzfs_handle_t *hdl, nvlist_t *props,
    const char *errbuf)
{
	nvpair_t *nvp = NULL;
	zfs_prop_t prop;
	const char *name;

	while ((nvp = nvlist_next_nvpair(props, nvp)) != NULL) {
		name = nvpair_name(nvp);
		prop = zfs_name_to_prop(name);

		/*
		 * A name that zfs_name_to_prop() maps to ZPROP_USERPROP is
		 * accepted only if zfs_prop_user() approves of it.
		 */
		if (prop == ZPROP_USERPROP) {
			if (!zfs_prop_user(name)) {
				zfs_error_aux(hdl, dgettext(TEXT_DOMAIN,
				    "%s: invalid property '%s'"), errbuf, name);
				return (B_FALSE);
			}
			continue;
		}
		/*
		 * "origin" is readonly but is used to receive datasets as
		 * clones so we don't raise an error here
		 */
		if (prop == ZFS_PROP_ORIGIN)
			continue;

		/* encryption params have their own verification later */
		if (prop == ZFS_PROP_ENCRYPTION ||
		    zfs_prop_encryption_key_param(prop))
			continue;

		/*
		 * cannot override readonly, set-once and other specific
		 * settable properties
		 */
		if (zfs_prop_readonly(prop) || prop == ZFS_PROP_VERSION ||
		    prop == ZFS_PROP_VOLSIZE) {
			zfs_error_aux(hdl, dgettext(TEXT_DOMAIN,
			    "%s: invalid property '%s'"), errbuf, name);
			return (B_FALSE);
		}
	}

	return (B_TRUE);
}

/*
 * Validate the command-line properties and the destination/origin
 * arguments, read the stream's BEGIN record from infd, and hand the stream
 * to zfs_receive_one() (DMU_SUBSTREAM header) or zfs_receive_package()
 * (DMU_COMPOUNDSTREAM header).
 *
 * Side effects on flags: byteswap is set according to the byte order of the
 * BEGIN record's magic number, and holds is set when the stream carries
 * DMU_BACKUP_FEATURE_HOLDS.
 *
 * Returns ENODATA when the first record read is a DRR_END, the recv_read()
 * error when the record cannot be read, the zfs_error() result when an
 * argument or the stream header is rejected, and otherwise whatever the
 * function the stream was dispatched to returns.
 */
static int
zfs_receive_impl(libzfs_handle_t *hdl, const char *tosnap,
    const char *originsnap, recvflags_t *flags, int infd, const char *sendfs,
    nvlist_t *stream_nv, avl_tree_t *stream_avl, char **top_zfs,
    const char *finalsnap, nvlist_t *cmdprops)
{
	int err;
	dmu_replay_record_t drr, drr_noswap;
	struct drr_begin *drrb = &drr.drr_u.drr_begin;
	char errbuf[ERRBUFLEN];
	zio_cksum_t zcksum = { { 0 } };
	uint64_t featureflags;
	int hdrtype;

	(void) snprintf(errbuf, sizeof (errbuf), dgettext(TEXT_DOMAIN,
	    "cannot receive"));

	/* check cmdline props, raise an error if they cannot be received */
	if (!zfs_receive_checkprops(hdl, cmdprops, errbuf))
		return (zfs_error(hdl, EZFS_BADPROP, errbuf));

	if (flags->isprefix &&
	    !zfs_dataset_exists(hdl, tosnap, ZFS_TYPE_DATASET)) {
		zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "specified fs "
		    "(%s) does not exist"), tosnap);
		return (zfs_error(hdl, EZFS_NOENT, errbuf));
	}
	if (originsnap &&
	    !zfs_dataset_exists(hdl, originsnap, ZFS_TYPE_DATASET)) {
		zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "specified origin fs "
		    "(%s) does not exist"), originsnap);
		return (zfs_error(hdl, EZFS_NOENT, errbuf));
	}

	/* read in the BEGIN record */
	if (0 != (err = recv_read(hdl, infd, &drr, sizeof (drr), B_FALSE,
	    &zcksum)))
		return (err);

	/* the type is tested in both byte orders: byteswap is not known yet */
	if (drr.drr_type == DRR_END || drr.drr_type == BSWAP_32(DRR_END)) {
		/* It's the double end record at the end of a package */
		return (ENODATA);
	}

	/* the kernel needs the non-byteswapped begin record */
	drr_noswap = drr;

	flags->byteswap = B_FALSE;
	if (drrb->drr_magic == BSWAP_64(DMU_BACKUP_MAGIC)) {
		/*
		 * We computed the checksum in the wrong byteorder in
		 * recv_read() above; do it again correctly.
		 */
		memset(&zcksum, 0, sizeof (zio_cksum_t));
		fletcher_4_incremental_byteswap(&drr, sizeof (drr), &zcksum);
		flags->byteswap = B_TRUE;

		/* swap the numeric header fields of the local copy in place */
		drr.drr_type = BSWAP_32(drr.drr_type);
		drr.drr_payloadlen = BSWAP_32(drr.drr_payloadlen);
		drrb->drr_magic = BSWAP_64(drrb->drr_magic);
		drrb->drr_versioninfo = BSWAP_64(drrb->drr_versioninfo);
		drrb->drr_creation_time = BSWAP_64(drrb->drr_creation_time);
		drrb->drr_type = BSWAP_32(drrb->drr_type);
		drrb->drr_flags = BSWAP_32(drrb->drr_flags);
		drrb->drr_toguid = BSWAP_64(drrb->drr_toguid);
		drrb->drr_fromguid = BSWAP_64(drrb->drr_fromguid);
	}

	if (drrb->drr_magic != DMU_BACKUP_MAGIC || drr.drr_type != DRR_BEGIN) {
		zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "invalid "
		    "stream (bad magic number)"));
		return (zfs_error(hdl, EZFS_BADSTREAM, errbuf));
	}

	featureflags = DMU_GET_FEATUREFLAGS(drrb->drr_versioninfo);
	hdrtype = DMU_GET_STREAM_HDRTYPE(drrb->drr_versioninfo);

	if (!DMU_STREAM_SUPPORTED(featureflags) ||
	    (hdrtype != DMU_SUBSTREAM && hdrtype != DMU_COMPOUNDSTREAM)) {
		/*
		 * Let's be explicit about this one, since rather than
		 * being a new feature we can't know, it's an old
		 * feature we dropped.
		 */
		if (featureflags & DMU_BACKUP_FEATURE_DEDUP) {
			zfs_error_aux(hdl, dgettext(TEXT_DOMAIN,
			    "stream has deprecated feature: dedup, try "
			    "'zstream redup [send in a file] | zfs recv "
			    "[...]'"));
		} else {
			zfs_error_aux(hdl, dgettext(TEXT_DOMAIN,
			    "stream has unsupported feature, feature flags = "
			    "%llx (unknown flags = %llx)"),
			    (u_longlong_t)featureflags,
			    (u_longlong_t)((featureflags) &
			    ~DMU_BACKUP_FEATURE_MASK));
		}
		return (zfs_error(hdl, EZFS_BADSTREAM, errbuf));
	}

	/* Holds feature is set once in the compound stream header. */
	if (featureflags & DMU_BACKUP_FEATURE_HOLDS)
		flags->holds = B_TRUE;

	if (strchr(drrb->drr_toname, '@') == NULL) {
		zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "invalid "
		    "stream (bad snapshot name)"));
		return (zfs_error(hdl, EZFS_BADSTREAM, errbuf));
	}

	if (DMU_GET_STREAM_HDRTYPE(drrb->drr_versioninfo) == DMU_SUBSTREAM) {
		char nonpackage_sendfs[ZFS_MAX_DATASET_NAME_LEN];
		if (sendfs == NULL) {
			/*
			 * We were not called from zfs_receive_package(). Get
			 * the fs specified by 'zfs send'.
			 */
			char *cp;
			(void) strlcpy(nonpackage_sendfs,
			    drr.drr_u.drr_begin.drr_toname,
			    sizeof (nonpackage_sendfs));
			if ((cp = strchr(nonpackage_sendfs, '@')) != NULL)
				*cp = '\0';
			sendfs = nonpackage_sendfs;
			VERIFY(finalsnap == NULL);
		}
		return (zfs_receive_one(hdl, infd, tosnap, originsnap, flags,
		    &drr, &drr_noswap, sendfs, stream_nv, stream_avl, top_zfs,
		    finalsnap, cmdprops));
	} else {
		assert(DMU_GET_STREAM_HDRTYPE(drrb->drr_versioninfo) ==
		    DMU_COMPOUNDSTREAM);
		return (zfs_receive_package(hdl, infd, tosnap, flags, &drr,
		    &zcksum, top_zfs, cmdprops));
	}
}

/*
 * Restores a backup of tosnap from the file descriptor specified by infd.
 * Return 0 on total success, -2 if some things couldn't be
 * destroyed/renamed/promoted, -1 if some things couldn't be received.
 * (-1 will override -2, if -1 and the resumable flag was specified the
 * transfer can be resumed if the sending side supports it).
 */
int
zfs_receive(libzfs_handle_t *hdl, const char *tosnap, nvlist_t *props,
    recvflags_t *flags, int infd, avl_tree_t *stream_avl)
{
	char *top_zfs = NULL;
	int err;
	struct stat sb;
	char *originsnap = NULL;

	/*
	 * The only way fstat can fail is if we do not have a valid file
	 * descriptor.
*/ if (fstat(infd, &sb) == -1) { perror("fstat"); return (-2); } if (props) { err = nvlist_lookup_string(props, "origin", &originsnap); if (err && err != ENOENT) return (err); } err = zfs_receive_impl(hdl, tosnap, originsnap, flags, infd, NULL, NULL, stream_avl, &top_zfs, NULL, props); if (err == 0 && !flags->nomount && flags->domount && top_zfs) { zfs_handle_t *zhp = NULL; prop_changelist_t *clp = NULL; zhp = zfs_open(hdl, top_zfs, ZFS_TYPE_FILESYSTEM | ZFS_TYPE_VOLUME); if (zhp == NULL) { err = -1; goto out; } else { if (zhp->zfs_type == ZFS_TYPE_VOLUME) { zfs_close(zhp); goto out; } clp = changelist_gather(zhp, ZFS_PROP_MOUNTPOINT, CL_GATHER_MOUNT_ALWAYS, flags->forceunmount ? MS_FORCE : 0); zfs_close(zhp); if (clp == NULL) { err = -1; goto out; } /* mount and share received datasets */ err = changelist_postfix(clp); changelist_free(clp); if (err != 0) err = -1; } } out: if (top_zfs) free(top_zfs); return (err); } diff --git a/module/zfs/dmu_recv.c b/module/zfs/dmu_recv.c index 9461643ceef8..61cfe36515a3 100644 --- a/module/zfs/dmu_recv.c +++ b/module/zfs/dmu_recv.c @@ -1,3743 +1,3754 @@ /* * CDDL HEADER START * * The contents of this file are subject to the terms of the * Common Development and Distribution License (the "License"). * You may not use this file except in compliance with the License. * * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE * or https://opensource.org/licenses/CDDL-1.0. * See the License for the specific language governing permissions * and limitations under the License. * * When distributing Covered Code, include this CDDL HEADER in each * file and include the License file at usr/src/OPENSOLARIS.LICENSE. * If applicable, add the following below this CDDL HEADER, with the * fields enclosed by brackets "[]" replaced with your own identifying * information: Portions Copyright [yyyy] [name of copyright owner] * * CDDL HEADER END */ /* * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. 
* Copyright 2011 Nexenta Systems, Inc. All rights reserved. * Copyright (c) 2011, 2020 by Delphix. All rights reserved. * Copyright (c) 2014, Joyent, Inc. All rights reserved. * Copyright 2014 HybridCluster. All rights reserved. * Copyright (c) 2018, loli10K . All rights reserved. * Copyright (c) 2019, Klara Inc. * Copyright (c) 2019, Allan Jude * Copyright (c) 2019 Datto Inc. * Copyright (c) 2022 Axcient. */ +#include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #ifdef _KERNEL #include #endif #include static uint_t zfs_recv_queue_length = SPA_MAXBLOCKSIZE; static uint_t zfs_recv_queue_ff = 20; static uint_t zfs_recv_write_batch_size = 1024 * 1024; static int zfs_recv_best_effort_corrective = 0; static const void *const dmu_recv_tag = "dmu_recv_tag"; const char *const recv_clone_name = "%recv"; static int receive_read_payload_and_next_header(dmu_recv_cookie_t *ra, int len, void *buf); struct receive_record_arg { dmu_replay_record_t header; void *payload; /* Pointer to a buffer containing the payload */ /* * If the record is a WRITE or SPILL, pointer to the abd containing the * payload. */ abd_t *abd; int payload_size; uint64_t bytes_read; /* bytes read from stream when record created */ boolean_t eos_marker; /* Marks the end of the stream */ bqueue_node_t node; }; struct receive_writer_arg { objset_t *os; boolean_t byteswap; bqueue_t q; /* * These three members are used to signal to the main thread when * we're done. 
*/ kmutex_t mutex; kcondvar_t cv; boolean_t done; int err; const char *tofs; boolean_t heal; boolean_t resumable; boolean_t raw; /* DMU_BACKUP_FEATURE_RAW set */ boolean_t spill; /* DRR_FLAG_SPILL_BLOCK set */ boolean_t full; /* this is a full send stream */ uint64_t last_object; uint64_t last_offset; uint64_t max_object; /* highest object ID referenced in stream */ uint64_t bytes_read; /* bytes read when current record created */ list_t write_batch; /* Encryption parameters for the last received DRR_OBJECT_RANGE */ boolean_t or_crypt_params_present; uint64_t or_firstobj; uint64_t or_numslots; uint8_t or_salt[ZIO_DATA_SALT_LEN]; uint8_t or_iv[ZIO_DATA_IV_LEN]; uint8_t or_mac[ZIO_DATA_MAC_LEN]; boolean_t or_byteorder; zio_t *heal_pio; }; typedef struct dmu_recv_begin_arg { const char *drba_origin; dmu_recv_cookie_t *drba_cookie; cred_t *drba_cred; proc_t *drba_proc; dsl_crypto_params_t *drba_dcp; } dmu_recv_begin_arg_t; static void byteswap_record(dmu_replay_record_t *drr) { #define DO64(X) (drr->drr_u.X = BSWAP_64(drr->drr_u.X)) #define DO32(X) (drr->drr_u.X = BSWAP_32(drr->drr_u.X)) drr->drr_type = BSWAP_32(drr->drr_type); drr->drr_payloadlen = BSWAP_32(drr->drr_payloadlen); switch (drr->drr_type) { case DRR_BEGIN: DO64(drr_begin.drr_magic); DO64(drr_begin.drr_versioninfo); DO64(drr_begin.drr_creation_time); DO32(drr_begin.drr_type); DO32(drr_begin.drr_flags); DO64(drr_begin.drr_toguid); DO64(drr_begin.drr_fromguid); break; case DRR_OBJECT: DO64(drr_object.drr_object); DO32(drr_object.drr_type); DO32(drr_object.drr_bonustype); DO32(drr_object.drr_blksz); DO32(drr_object.drr_bonuslen); DO32(drr_object.drr_raw_bonuslen); DO64(drr_object.drr_toguid); DO64(drr_object.drr_maxblkid); break; case DRR_FREEOBJECTS: DO64(drr_freeobjects.drr_firstobj); DO64(drr_freeobjects.drr_numobjs); DO64(drr_freeobjects.drr_toguid); break; case DRR_WRITE: DO64(drr_write.drr_object); DO32(drr_write.drr_type); DO64(drr_write.drr_offset); DO64(drr_write.drr_logical_size); 
DO64(drr_write.drr_toguid); ZIO_CHECKSUM_BSWAP(&drr->drr_u.drr_write.drr_key.ddk_cksum); DO64(drr_write.drr_key.ddk_prop); DO64(drr_write.drr_compressed_size); break; case DRR_WRITE_EMBEDDED: DO64(drr_write_embedded.drr_object); DO64(drr_write_embedded.drr_offset); DO64(drr_write_embedded.drr_length); DO64(drr_write_embedded.drr_toguid); DO32(drr_write_embedded.drr_lsize); DO32(drr_write_embedded.drr_psize); break; case DRR_FREE: DO64(drr_free.drr_object); DO64(drr_free.drr_offset); DO64(drr_free.drr_length); DO64(drr_free.drr_toguid); break; case DRR_SPILL: DO64(drr_spill.drr_object); DO64(drr_spill.drr_length); DO64(drr_spill.drr_toguid); DO64(drr_spill.drr_compressed_size); DO32(drr_spill.drr_type); break; case DRR_OBJECT_RANGE: DO64(drr_object_range.drr_firstobj); DO64(drr_object_range.drr_numslots); DO64(drr_object_range.drr_toguid); break; case DRR_REDACT: DO64(drr_redact.drr_object); DO64(drr_redact.drr_offset); DO64(drr_redact.drr_length); DO64(drr_redact.drr_toguid); break; case DRR_END: DO64(drr_end.drr_toguid); ZIO_CHECKSUM_BSWAP(&drr->drr_u.drr_end.drr_checksum); break; default: break; } if (drr->drr_type != DRR_BEGIN) { ZIO_CHECKSUM_BSWAP(&drr->drr_u.drr_checksum.drr_checksum); } #undef DO64 #undef DO32 } static boolean_t redact_snaps_contains(uint64_t *snaps, uint64_t num_snaps, uint64_t guid) { for (int i = 0; i < num_snaps; i++) { if (snaps[i] == guid) return (B_TRUE); } return (B_FALSE); } /* * Check that the new stream we're trying to receive is redacted with respect to * a subset of the snapshots that the origin was redacted with respect to. For * the reasons behind this, see the man page on redacted zfs sends and receives. */ static boolean_t compatible_redact_snaps(uint64_t *origin_snaps, uint64_t origin_num_snaps, uint64_t *redact_snaps, uint64_t num_redact_snaps) { /* * Short circuit the comparison; if we are redacted with respect to * more snapshots than the origin, we can't be redacted with respect * to a subset. 
*/ if (num_redact_snaps > origin_num_snaps) { return (B_FALSE); } for (int i = 0; i < num_redact_snaps; i++) { if (!redact_snaps_contains(origin_snaps, origin_num_snaps, redact_snaps[i])) { return (B_FALSE); } } return (B_TRUE); } static boolean_t redact_check(dmu_recv_begin_arg_t *drba, dsl_dataset_t *origin) { uint64_t *origin_snaps; uint64_t origin_num_snaps; dmu_recv_cookie_t *drc = drba->drba_cookie; struct drr_begin *drrb = drc->drc_drrb; int featureflags = DMU_GET_FEATUREFLAGS(drrb->drr_versioninfo); int err = 0; boolean_t ret = B_TRUE; uint64_t *redact_snaps; uint_t numredactsnaps; /* * If this is a full send stream, we're safe no matter what. */ if (drrb->drr_fromguid == 0) return (ret); VERIFY(dsl_dataset_get_uint64_array_feature(origin, SPA_FEATURE_REDACTED_DATASETS, &origin_num_snaps, &origin_snaps)); if (nvlist_lookup_uint64_array(drc->drc_begin_nvl, BEGINNV_REDACT_FROM_SNAPS, &redact_snaps, &numredactsnaps) == 0) { /* * If the send stream was sent from the redaction bookmark or * the redacted version of the dataset, then we're safe. Verify * that this is from the a compatible redaction bookmark or * redacted dataset. */ if (!compatible_redact_snaps(origin_snaps, origin_num_snaps, redact_snaps, numredactsnaps)) { err = EINVAL; } } else if (featureflags & DMU_BACKUP_FEATURE_REDACTED) { /* * If the stream is redacted, it must be redacted with respect * to a subset of what the origin is redacted with respect to. * See case number 2 in the zfs man page section on redacted zfs * send. */ err = nvlist_lookup_uint64_array(drc->drc_begin_nvl, BEGINNV_REDACT_SNAPS, &redact_snaps, &numredactsnaps); if (err != 0 || !compatible_redact_snaps(origin_snaps, origin_num_snaps, redact_snaps, numredactsnaps)) { err = EINVAL; } } else if (!redact_snaps_contains(origin_snaps, origin_num_snaps, drrb->drr_toguid)) { /* * If the stream isn't redacted but the origin is, this must be * one of the snapshots the origin is redacted with respect to. 
* See case number 1 in the zfs man page section on redacted zfs * send. */ err = EINVAL; } if (err != 0) ret = B_FALSE; return (ret); } /* * If we previously received a stream with --large-block, we don't support * receiving an incremental on top of it without --large-block. This avoids * forcing a read-modify-write or trying to re-aggregate a string of WRITE * records. */ static int recv_check_large_blocks(dsl_dataset_t *ds, uint64_t featureflags) { if (dsl_dataset_feature_is_active(ds, SPA_FEATURE_LARGE_BLOCKS) && !(featureflags & DMU_BACKUP_FEATURE_LARGE_BLOCKS)) return (SET_ERROR(ZFS_ERR_STREAM_LARGE_BLOCK_MISMATCH)); return (0); } static int recv_begin_check_existing_impl(dmu_recv_begin_arg_t *drba, dsl_dataset_t *ds, uint64_t fromguid, uint64_t featureflags) { uint64_t obj; uint64_t children; int error; dsl_dataset_t *snap; dsl_pool_t *dp = ds->ds_dir->dd_pool; boolean_t encrypted = ds->ds_dir->dd_crypto_obj != 0; boolean_t raw = (featureflags & DMU_BACKUP_FEATURE_RAW) != 0; boolean_t embed = (featureflags & DMU_BACKUP_FEATURE_EMBED_DATA) != 0; /* Temporary clone name must not exist. */ error = zap_lookup(dp->dp_meta_objset, dsl_dir_phys(ds->ds_dir)->dd_child_dir_zapobj, recv_clone_name, 8, 1, &obj); if (error != ENOENT) return (error == 0 ? SET_ERROR(EBUSY) : error); /* Resume state must not be set. */ if (dsl_dataset_has_resume_receive_state(ds)) return (SET_ERROR(EBUSY)); /* New snapshot name must not exist if we're not healing it. */ error = zap_lookup(dp->dp_meta_objset, dsl_dataset_phys(ds)->ds_snapnames_zapobj, drba->drba_cookie->drc_tosnap, 8, 1, &obj); if (drba->drba_cookie->drc_heal) { if (error != 0) return (error); } else if (error != ENOENT) { return (error == 0 ? SET_ERROR(EEXIST) : error); } /* Must not have children if receiving a ZVOL. 
*/ error = zap_count(dp->dp_meta_objset, dsl_dir_phys(ds->ds_dir)->dd_child_dir_zapobj, &children); if (error != 0) return (error); if (drba->drba_cookie->drc_drrb->drr_type != DMU_OST_ZFS && children > 0) return (SET_ERROR(ZFS_ERR_WRONG_PARENT)); /* * Check snapshot limit before receiving. We'll recheck again at the * end, but might as well abort before receiving if we're already over * the limit. * * Note that we do not check the file system limit with * dsl_dir_fscount_check because the temporary %clones don't count * against that limit. */ error = dsl_fs_ss_limit_check(ds->ds_dir, 1, ZFS_PROP_SNAPSHOT_LIMIT, NULL, drba->drba_cred, drba->drba_proc); if (error != 0) return (error); if (drba->drba_cookie->drc_heal) { /* Encryption is incompatible with embedded data. */ if (encrypted && embed) return (SET_ERROR(EINVAL)); /* Healing is not supported when in 'force' mode. */ if (drba->drba_cookie->drc_force) return (SET_ERROR(EINVAL)); /* Must have keys loaded if doing encrypted non-raw recv. */ if (encrypted && !raw) { if (spa_keystore_lookup_key(dp->dp_spa, ds->ds_object, NULL, NULL) != 0) return (SET_ERROR(EACCES)); } error = dsl_dataset_hold_obj(dp, obj, FTAG, &snap); if (error != 0) return (error); /* * When not doing best effort corrective recv healing can only * be done if the send stream is for the same snapshot as the * one we are trying to heal. */ if (zfs_recv_best_effort_corrective == 0 && drba->drba_cookie->drc_drrb->drr_toguid != dsl_dataset_phys(snap)->ds_guid) { dsl_dataset_rele(snap, FTAG); return (SET_ERROR(ENOTSUP)); } dsl_dataset_rele(snap, FTAG); } else if (fromguid != 0) { /* Sanity check the incremental recv */ uint64_t obj = dsl_dataset_phys(ds)->ds_prev_snap_obj; /* Can't perform a raw receive on top of a non-raw receive */ if (!encrypted && raw) return (SET_ERROR(EINVAL)); /* Encryption is incompatible with embedded data */ if (encrypted && embed) return (SET_ERROR(EINVAL)); /* Find snapshot in this dir that matches fromguid. 
*/ while (obj != 0) { error = dsl_dataset_hold_obj(dp, obj, FTAG, &snap); if (error != 0) return (SET_ERROR(ENODEV)); if (snap->ds_dir != ds->ds_dir) { dsl_dataset_rele(snap, FTAG); return (SET_ERROR(ENODEV)); } if (dsl_dataset_phys(snap)->ds_guid == fromguid) break; obj = dsl_dataset_phys(snap)->ds_prev_snap_obj; dsl_dataset_rele(snap, FTAG); } if (obj == 0) return (SET_ERROR(ENODEV)); if (drba->drba_cookie->drc_force) { drba->drba_cookie->drc_fromsnapobj = obj; } else { /* * If we are not forcing, there must be no * changes since fromsnap. Raw sends have an * additional constraint that requires that * no "noop" snapshots exist between fromsnap * and tosnap for the IVset checking code to * work properly. */ if (dsl_dataset_modified_since_snap(ds, snap) || (raw && dsl_dataset_phys(ds)->ds_prev_snap_obj != snap->ds_object)) { dsl_dataset_rele(snap, FTAG); return (SET_ERROR(ETXTBSY)); } drba->drba_cookie->drc_fromsnapobj = ds->ds_prev->ds_object; } if (dsl_dataset_feature_is_active(snap, SPA_FEATURE_REDACTED_DATASETS) && !redact_check(drba, snap)) { dsl_dataset_rele(snap, FTAG); return (SET_ERROR(EINVAL)); } error = recv_check_large_blocks(snap, featureflags); if (error != 0) { dsl_dataset_rele(snap, FTAG); return (error); } dsl_dataset_rele(snap, FTAG); } else { /* If full and not healing then must be forced. */ if (!drba->drba_cookie->drc_force) return (SET_ERROR(EEXIST)); /* * We don't support using zfs recv -F to blow away * encrypted filesystems. This would require the * dsl dir to point to the old encryption key and * the new one at the same time during the receive. */ if ((!encrypted && raw) || encrypted) return (SET_ERROR(EINVAL)); /* * Perform the same encryption checks we would if * we were creating a new dataset from scratch. 
*/ if (!raw) { boolean_t will_encrypt; error = dmu_objset_create_crypt_check( ds->ds_dir->dd_parent, drba->drba_dcp, &will_encrypt); if (error != 0) return (error); if (will_encrypt && embed) return (SET_ERROR(EINVAL)); } } return (0); } /* * Check that any feature flags used in the data stream we're receiving are * supported by the pool we are receiving into. * * Note that some of the features we explicitly check here have additional * (implicit) features they depend on, but those dependencies are enforced * through the zfeature_register() calls declaring the features that we * explicitly check. */ static int recv_begin_check_feature_flags_impl(uint64_t featureflags, spa_t *spa) { /* * Check if there are any unsupported feature flags. */ if (!DMU_STREAM_SUPPORTED(featureflags)) { return (SET_ERROR(ZFS_ERR_UNKNOWN_SEND_STREAM_FEATURE)); } /* Verify pool version supports SA if SA_SPILL feature set */ if ((featureflags & DMU_BACKUP_FEATURE_SA_SPILL) && spa_version(spa) < SPA_VERSION_SA) return (SET_ERROR(ENOTSUP)); /* * LZ4 compressed, ZSTD compressed, embedded, mooched, large blocks, * and large_dnodes in the stream can only be used if those pool * features are enabled because we don't attempt to decompress / * un-embed / un-mooch / split up the blocks / dnodes during the * receive process. 
*/ if ((featureflags & DMU_BACKUP_FEATURE_LZ4) && !spa_feature_is_enabled(spa, SPA_FEATURE_LZ4_COMPRESS)) return (SET_ERROR(ENOTSUP)); if ((featureflags & DMU_BACKUP_FEATURE_ZSTD) && !spa_feature_is_enabled(spa, SPA_FEATURE_ZSTD_COMPRESS)) return (SET_ERROR(ENOTSUP)); if ((featureflags & DMU_BACKUP_FEATURE_EMBED_DATA) && !spa_feature_is_enabled(spa, SPA_FEATURE_EMBEDDED_DATA)) return (SET_ERROR(ENOTSUP)); if ((featureflags & DMU_BACKUP_FEATURE_LARGE_BLOCKS) && !spa_feature_is_enabled(spa, SPA_FEATURE_LARGE_BLOCKS)) return (SET_ERROR(ENOTSUP)); if ((featureflags & DMU_BACKUP_FEATURE_LARGE_DNODE) && !spa_feature_is_enabled(spa, SPA_FEATURE_LARGE_DNODE)) return (SET_ERROR(ENOTSUP)); /* * Receiving redacted streams requires that redacted datasets are * enabled. */ if ((featureflags & DMU_BACKUP_FEATURE_REDACTED) && !spa_feature_is_enabled(spa, SPA_FEATURE_REDACTED_DATASETS)) return (SET_ERROR(ENOTSUP)); return (0); }

/*
 * Check whether the (non-resuming) receive described by 'arg', a
 * dmu_recv_begin_arg_t, may start.
 *
 * The stream header and feature flags are validated first.  Then either
 * the existing target filesystem is checked through
 * recv_begin_check_existing_impl(), or, when the target does not exist,
 * its parent (and the clone origin, if one was given) is checked.
 *
 * Returns 0 when the receive may proceed, otherwise an errno or ZFS_ERR_*
 * value.  Every dataset hold taken here is dropped before returning.
 */
static int
dmu_recv_begin_check(void *arg, dmu_tx_t *tx)
{
	dmu_recv_begin_arg_t *drba = arg;
	dsl_pool_t *dp = dmu_tx_pool(tx);
	struct drr_begin *drrb = drba->drba_cookie->drc_drrb;
	uint64_t fromguid = drrb->drr_fromguid;
	int flags = drrb->drr_flags;
	ds_hold_flags_t dsflags = DS_HOLD_FLAG_NONE;
	int error;
	uint64_t featureflags = drba->drba_cookie->drc_featureflags;
	dsl_dataset_t *ds;
	const char *tofs = drba->drba_cookie->drc_tofs;

	/* already checked */
	ASSERT3U(drrb->drr_magic, ==, DMU_BACKUP_MAGIC);
	ASSERT(!(featureflags & DMU_BACKUP_FEATURE_RESUMING));

	if (DMU_GET_STREAM_HDRTYPE(drrb->drr_versioninfo) ==
	    DMU_COMPOUNDSTREAM ||
	    drrb->drr_type >= DMU_OST_NUMTYPES ||
	    ((flags & DRR_FLAG_CLONE) && drba->drba_origin == NULL))
		return (SET_ERROR(EINVAL));

	error = recv_begin_check_feature_flags_impl(featureflags, dp->dp_spa);
	if (error != 0)
		return (error);

	/* Resumable receives require extensible datasets */
	if (drba->drba_cookie->drc_resumable &&
	    !spa_feature_is_enabled(dp->dp_spa, SPA_FEATURE_EXTENSIBLE_DATASET))
		return (SET_ERROR(ENOTSUP));

	if (featureflags & DMU_BACKUP_FEATURE_RAW) {
		/* raw receives require the encryption feature */
		if (!spa_feature_is_enabled(dp->dp_spa, SPA_FEATURE_ENCRYPTION))
			return (SET_ERROR(ENOTSUP));

		/* embedded data is incompatible with encryption and raw recv */
		if (featureflags & DMU_BACKUP_FEATURE_EMBED_DATA)
			return (SET_ERROR(EINVAL));

		/* raw receives require spill block allocation flag */
		if (!(flags & DRR_FLAG_SPILL_BLOCK))
			return (SET_ERROR(ZFS_ERR_SPILL_BLOCK_FLAG_MISSING));
	} else {
		/*
		 * We support unencrypted datasets below encrypted ones now,
		 * so add the DS_HOLD_FLAG_DECRYPT flag only if we are dealing
		 * with a dataset we may encrypt.
		 */
		if (drba->drba_dcp == NULL ||
		    drba->drba_dcp->cp_crypt != ZIO_CRYPT_OFF) {
			dsflags |= DS_HOLD_FLAG_DECRYPT;
		}
	}

	error = dsl_dataset_hold_flags(dp, tofs, dsflags, FTAG, &ds);
	if (error == 0) {
		/* target fs already exists; recv into temp clone */

		/* Can't recv a clone into an existing fs */
		if (flags & DRR_FLAG_CLONE || drba->drba_origin) {
			dsl_dataset_rele_flags(ds, dsflags, FTAG);
			return (SET_ERROR(EINVAL));
		}

		error = recv_begin_check_existing_impl(drba, ds, fromguid,
		    featureflags);
		dsl_dataset_rele_flags(ds, dsflags, FTAG);
	} else if (error == ENOENT) {
		/* target fs does not exist; must be a full backup or clone */
		char buf[ZFS_MAX_DATASET_NAME_LEN];
		objset_t *os;

		/* healing recv must be done "into" an existing snapshot */
		if (drba->drba_cookie->drc_heal == B_TRUE)
			return (SET_ERROR(ENOTSUP));

		/*
		 * If it's a non-clone incremental, we are missing the
		 * target fs, so fail the recv.
		 */
		if (fromguid != 0 && !((flags & DRR_FLAG_CLONE) ||
		    drba->drba_origin))
			return (SET_ERROR(ENOENT));

		/*
		 * If we're receiving a full send as a clone, and it doesn't
		 * contain all the necessary free records and freeobject
		 * records, reject it.
		 */
		if (fromguid == 0 && drba->drba_origin != NULL &&
		    !(flags & DRR_FLAG_FREERECORDS))
			return (SET_ERROR(EINVAL));

		/* Open the parent of tofs */
		ASSERT3U(strlen(tofs), <, sizeof (buf));
		(void) strlcpy(buf, tofs, strrchr(tofs, '/') - tofs + 1);
		/*
		 * The parent is held WITHOUT hold flags, so every release of
		 * 'ds' in this branch must be a plain dsl_dataset_rele().
		 */
		error = dsl_dataset_hold(dp, buf, FTAG, &ds);
		if (error != 0)
			return (error);

		if ((featureflags & DMU_BACKUP_FEATURE_RAW) == 0 &&
		    drba->drba_origin == NULL) {
			boolean_t will_encrypt;

			/*
			 * Check that we aren't breaking any encryption rules
			 * and that we have all the parameters we need to
			 * create an encrypted dataset if necessary. If we are
			 * making an encrypted dataset the stream can't have
			 * embedded data.
			 */
			error = dmu_objset_create_crypt_check(ds->ds_dir,
			    drba->drba_dcp, &will_encrypt);
			if (error != 0) {
				dsl_dataset_rele(ds, FTAG);
				return (error);
			}

			if (will_encrypt &&
			    (featureflags & DMU_BACKUP_FEATURE_EMBED_DATA)) {
				dsl_dataset_rele(ds, FTAG);
				return (SET_ERROR(EINVAL));
			}
		}

		/*
		 * Check filesystem and snapshot limits before receiving. We'll
		 * recheck snapshot limits again at the end (we create the
		 * filesystems and increment those counts during begin_sync).
		 */
		error = dsl_fs_ss_limit_check(ds->ds_dir, 1,
		    ZFS_PROP_FILESYSTEM_LIMIT, NULL,
		    drba->drba_cred, drba->drba_proc);
		if (error != 0) {
			dsl_dataset_rele(ds, FTAG);
			return (error);
		}

		error = dsl_fs_ss_limit_check(ds->ds_dir, 1,
		    ZFS_PROP_SNAPSHOT_LIMIT, NULL,
		    drba->drba_cred, drba->drba_proc);
		if (error != 0) {
			dsl_dataset_rele(ds, FTAG);
			return (error);
		}

		/* can't recv below anything but filesystems (eg. no ZVOLs) */
		error = dmu_objset_from_ds(ds, &os);
		if (error != 0) {
			dsl_dataset_rele(ds, FTAG);
			return (error);
		}
		if (dmu_objset_type(os) != DMU_OST_ZFS) {
			dsl_dataset_rele(ds, FTAG);
			return (SET_ERROR(ZFS_ERR_WRONG_PARENT));
		}

		if (drba->drba_origin != NULL) {
			dsl_dataset_t *origin;

			/* The origin, unlike 'ds', is held with dsflags. */
			error = dsl_dataset_hold_flags(dp, drba->drba_origin,
			    dsflags, FTAG, &origin);
			if (error != 0) {
				dsl_dataset_rele(ds, FTAG);
				return (error);
			}
			if (!origin->ds_is_snapshot) {
				dsl_dataset_rele_flags(origin, dsflags, FTAG);
				dsl_dataset_rele(ds, FTAG);
				return (SET_ERROR(EINVAL));
			}
			if (dsl_dataset_phys(origin)->ds_guid != fromguid &&
			    fromguid != 0) {
				dsl_dataset_rele_flags(origin, dsflags, FTAG);
				dsl_dataset_rele(ds, FTAG);
				return (SET_ERROR(ENODEV));
			}

			if (origin->ds_dir->dd_crypto_obj != 0 &&
			    (featureflags & DMU_BACKUP_FEATURE_EMBED_DATA)) {
				dsl_dataset_rele_flags(origin, dsflags, FTAG);
				dsl_dataset_rele(ds, FTAG);
				return (SET_ERROR(EINVAL));
			}

			/*
			 * If the origin is redacted we need to verify that this
			 * send stream can safely be received on top of the
			 * origin.
			 */
			if (dsl_dataset_feature_is_active(origin,
			    SPA_FEATURE_REDACTED_DATASETS)) {
				if (!redact_check(drba, origin)) {
					dsl_dataset_rele_flags(origin, dsflags,
					    FTAG);
					/* plain rele: matches the hold above */
					dsl_dataset_rele(ds, FTAG);
					return (SET_ERROR(EINVAL));
				}
			}

			error = recv_check_large_blocks(ds, featureflags);
			if (error != 0) {
				dsl_dataset_rele_flags(origin, dsflags, FTAG);
				/* plain rele: matches the hold above */
				dsl_dataset_rele(ds, FTAG);
				return (error);
			}

			dsl_dataset_rele_flags(origin, dsflags, FTAG);
		}

		dsl_dataset_rele(ds, FTAG);
		error = 0;
	}
	return (error);
}

static void dmu_recv_begin_sync(void *arg, dmu_tx_t *tx) { dmu_recv_begin_arg_t *drba = arg; dsl_pool_t *dp = dmu_tx_pool(tx); objset_t *mos = dp->dp_meta_objset; dmu_recv_cookie_t *drc = drba->drba_cookie; struct drr_begin *drrb = drc->drc_drrb; const char *tofs = drc->drc_tofs; uint64_t featureflags = drc->drc_featureflags; dsl_dataset_t *ds, *newds; objset_t *os; uint64_t dsobj; ds_hold_flags_t dsflags = DS_HOLD_FLAG_NONE; int error; uint64_t crflags = 0; dsl_crypto_params_t dummy_dcp = { 0 }; dsl_crypto_params_t *dcp = drba->drba_dcp; if (drrb->drr_flags & DRR_FLAG_CI_DATA) crflags |= DS_FLAG_CI_DATASET; if ((featureflags & DMU_BACKUP_FEATURE_RAW) == 0) dsflags |= DS_HOLD_FLAG_DECRYPT; /* * Raw, non-incremental recvs always use a dummy dcp with * the raw cmd set. Raw incremental recvs do not use a dcp * since the encryption parameters are already set in stone.
*/ if (dcp == NULL && drrb->drr_fromguid == 0 && drba->drba_origin == NULL) { ASSERT3P(dcp, ==, NULL); dcp = &dummy_dcp; if (featureflags & DMU_BACKUP_FEATURE_RAW) dcp->cp_cmd = DCP_CMD_RAW_RECV; } error = dsl_dataset_hold_flags(dp, tofs, dsflags, FTAG, &ds); if (error == 0) { /* Create temporary clone unless we're doing corrective recv */ dsl_dataset_t *snap = NULL; if (drba->drba_cookie->drc_fromsnapobj != 0) { VERIFY0(dsl_dataset_hold_obj(dp, drba->drba_cookie->drc_fromsnapobj, FTAG, &snap)); ASSERT3P(dcp, ==, NULL); } if (drc->drc_heal) { /* When healing we want to use the provided snapshot */ VERIFY0(dsl_dataset_snap_lookup(ds, drc->drc_tosnap, &dsobj)); } else { dsobj = dsl_dataset_create_sync(ds->ds_dir, recv_clone_name, snap, crflags, drba->drba_cred, dcp, tx); } if (drba->drba_cookie->drc_fromsnapobj != 0) dsl_dataset_rele(snap, FTAG); dsl_dataset_rele_flags(ds, dsflags, FTAG); } else { dsl_dir_t *dd; const char *tail; dsl_dataset_t *origin = NULL; VERIFY0(dsl_dir_hold(dp, tofs, FTAG, &dd, &tail)); if (drba->drba_origin != NULL) { VERIFY0(dsl_dataset_hold(dp, drba->drba_origin, FTAG, &origin)); ASSERT3P(dcp, ==, NULL); } /* Create new dataset. */ dsobj = dsl_dataset_create_sync(dd, strrchr(tofs, '/') + 1, origin, crflags, drba->drba_cred, dcp, tx); if (origin != NULL) dsl_dataset_rele(origin, FTAG); dsl_dir_rele(dd, FTAG); drc->drc_newfs = B_TRUE; } VERIFY0(dsl_dataset_own_obj_force(dp, dsobj, dsflags, dmu_recv_tag, &newds)); if (dsl_dataset_feature_is_active(newds, SPA_FEATURE_REDACTED_DATASETS)) { /* * If the origin dataset is redacted, the child will be redacted * when we create it. We clear the new dataset's * redaction info; if it should be redacted, we'll fill * in its information later. 
*/ dsl_dataset_deactivate_feature(newds, SPA_FEATURE_REDACTED_DATASETS, tx); } VERIFY0(dmu_objset_from_ds(newds, &os)); if (drc->drc_resumable) { dsl_dataset_zapify(newds, tx); if (drrb->drr_fromguid != 0) { VERIFY0(zap_add(mos, dsobj, DS_FIELD_RESUME_FROMGUID, 8, 1, &drrb->drr_fromguid, tx)); } VERIFY0(zap_add(mos, dsobj, DS_FIELD_RESUME_TOGUID, 8, 1, &drrb->drr_toguid, tx)); VERIFY0(zap_add(mos, dsobj, DS_FIELD_RESUME_TONAME, 1, strlen(drrb->drr_toname) + 1, drrb->drr_toname, tx)); uint64_t one = 1; uint64_t zero = 0; VERIFY0(zap_add(mos, dsobj, DS_FIELD_RESUME_OBJECT, 8, 1, &one, tx)); VERIFY0(zap_add(mos, dsobj, DS_FIELD_RESUME_OFFSET, 8, 1, &zero, tx)); VERIFY0(zap_add(mos, dsobj, DS_FIELD_RESUME_BYTES, 8, 1, &zero, tx)); if (featureflags & DMU_BACKUP_FEATURE_LARGE_BLOCKS) { VERIFY0(zap_add(mos, dsobj, DS_FIELD_RESUME_LARGEBLOCK, 8, 1, &one, tx)); } if (featureflags & DMU_BACKUP_FEATURE_EMBED_DATA) { VERIFY0(zap_add(mos, dsobj, DS_FIELD_RESUME_EMBEDOK, 8, 1, &one, tx)); } if (featureflags & DMU_BACKUP_FEATURE_COMPRESSED) { VERIFY0(zap_add(mos, dsobj, DS_FIELD_RESUME_COMPRESSOK, 8, 1, &one, tx)); } if (featureflags & DMU_BACKUP_FEATURE_RAW) { VERIFY0(zap_add(mos, dsobj, DS_FIELD_RESUME_RAWOK, 8, 1, &one, tx)); } uint64_t *redact_snaps; uint_t numredactsnaps; if (nvlist_lookup_uint64_array(drc->drc_begin_nvl, BEGINNV_REDACT_FROM_SNAPS, &redact_snaps, &numredactsnaps) == 0) { VERIFY0(zap_add(mos, dsobj, DS_FIELD_RESUME_REDACT_BOOKMARK_SNAPS, sizeof (*redact_snaps), numredactsnaps, redact_snaps, tx)); } } /* * Usually the os->os_encrypted value is tied to the presence of a * DSL Crypto Key object in the dd. However, that will not be received * until dmu_recv_stream(), so we set the value manually for now. 
*/ if (featureflags & DMU_BACKUP_FEATURE_RAW) { os->os_encrypted = B_TRUE; drba->drba_cookie->drc_raw = B_TRUE; } if (featureflags & DMU_BACKUP_FEATURE_REDACTED) { uint64_t *redact_snaps; uint_t numredactsnaps; VERIFY0(nvlist_lookup_uint64_array(drc->drc_begin_nvl, BEGINNV_REDACT_SNAPS, &redact_snaps, &numredactsnaps)); dsl_dataset_activate_redaction(newds, redact_snaps, numredactsnaps, tx); } dmu_buf_will_dirty(newds->ds_dbuf, tx); dsl_dataset_phys(newds)->ds_flags |= DS_FLAG_INCONSISTENT; /* * If we actually created a non-clone, we need to create the objset * in our new dataset. If this is a raw send we postpone this until * dmu_recv_stream() so that we can allocate the metadnode with the * properties from the DRR_BEGIN payload. */ rrw_enter(&newds->ds_bp_rwlock, RW_READER, FTAG); if (BP_IS_HOLE(dsl_dataset_get_blkptr(newds)) && (featureflags & DMU_BACKUP_FEATURE_RAW) == 0 && !drc->drc_heal) { (void) dmu_objset_create_impl(dp->dp_spa, newds, dsl_dataset_get_blkptr(newds), drrb->drr_type, tx); } rrw_exit(&newds->ds_bp_rwlock, FTAG); drba->drba_cookie->drc_ds = newds; drba->drba_cookie->drc_os = os; spa_history_log_internal_ds(newds, "receive", tx, " "); } static int dmu_recv_resume_begin_check(void *arg, dmu_tx_t *tx) { dmu_recv_begin_arg_t *drba = arg; dmu_recv_cookie_t *drc = drba->drba_cookie; dsl_pool_t *dp = dmu_tx_pool(tx); struct drr_begin *drrb = drc->drc_drrb; int error; ds_hold_flags_t dsflags = DS_HOLD_FLAG_NONE; dsl_dataset_t *ds; const char *tofs = drc->drc_tofs; /* already checked */ ASSERT3U(drrb->drr_magic, ==, DMU_BACKUP_MAGIC); ASSERT(drc->drc_featureflags & DMU_BACKUP_FEATURE_RESUMING); if (DMU_GET_STREAM_HDRTYPE(drrb->drr_versioninfo) == DMU_COMPOUNDSTREAM || drrb->drr_type >= DMU_OST_NUMTYPES) return (SET_ERROR(EINVAL)); /* * This is mostly a sanity check since we should have already done these * checks during a previous attempt to receive the data. 
*/ error = recv_begin_check_feature_flags_impl(drc->drc_featureflags, dp->dp_spa); if (error != 0) return (error); /* 6 extra bytes for /%recv */ char recvname[ZFS_MAX_DATASET_NAME_LEN + 6]; (void) snprintf(recvname, sizeof (recvname), "%s/%s", tofs, recv_clone_name); if (drc->drc_featureflags & DMU_BACKUP_FEATURE_RAW) { /* raw receives require spill block allocation flag */ if (!(drrb->drr_flags & DRR_FLAG_SPILL_BLOCK)) return (SET_ERROR(ZFS_ERR_SPILL_BLOCK_FLAG_MISSING)); } else { dsflags |= DS_HOLD_FLAG_DECRYPT; } boolean_t recvexist = B_TRUE; if (dsl_dataset_hold_flags(dp, recvname, dsflags, FTAG, &ds) != 0) { /* %recv does not exist; continue in tofs */ recvexist = B_FALSE; error = dsl_dataset_hold_flags(dp, tofs, dsflags, FTAG, &ds); if (error != 0) return (error); } /* * Resume of full/newfs recv on existing dataset should be done with * force flag */ if (recvexist && drrb->drr_fromguid == 0 && !drc->drc_force) { dsl_dataset_rele_flags(ds, dsflags, FTAG); return (SET_ERROR(ZFS_ERR_RESUME_EXISTS)); } /* check that ds is marked inconsistent */ if (!DS_IS_INCONSISTENT(ds)) { dsl_dataset_rele_flags(ds, dsflags, FTAG); return (SET_ERROR(EINVAL)); } /* check that there is resuming data, and that the toguid matches */ if (!dsl_dataset_is_zapified(ds)) { dsl_dataset_rele_flags(ds, dsflags, FTAG); return (SET_ERROR(EINVAL)); } uint64_t val; error = zap_lookup(dp->dp_meta_objset, ds->ds_object, DS_FIELD_RESUME_TOGUID, sizeof (val), 1, &val); if (error != 0 || drrb->drr_toguid != val) { dsl_dataset_rele_flags(ds, dsflags, FTAG); return (SET_ERROR(EINVAL)); } /* * Check if the receive is still running. If so, it will be owned. * Note that nothing else can own the dataset (e.g. after the receive * fails) because it will be marked inconsistent. */ if (dsl_dataset_has_owner(ds)) { dsl_dataset_rele_flags(ds, dsflags, FTAG); return (SET_ERROR(EBUSY)); } /* There should not be any snapshots of this fs yet. 
*/ if (ds->ds_prev != NULL && ds->ds_prev->ds_dir == ds->ds_dir) { dsl_dataset_rele_flags(ds, dsflags, FTAG); return (SET_ERROR(EINVAL)); } /* * Note: resume point will be checked when we process the first WRITE * record. */ /* check that the origin matches */ val = 0; (void) zap_lookup(dp->dp_meta_objset, ds->ds_object, DS_FIELD_RESUME_FROMGUID, sizeof (val), 1, &val); if (drrb->drr_fromguid != val) { dsl_dataset_rele_flags(ds, dsflags, FTAG); return (SET_ERROR(EINVAL)); } if (ds->ds_prev != NULL && drrb->drr_fromguid != 0) drc->drc_fromsnapobj = ds->ds_prev->ds_object; /* * If we're resuming, and the send is redacted, then the original send * must have been redacted, and must have been redacted with respect to * the same snapshots. */ if (drc->drc_featureflags & DMU_BACKUP_FEATURE_REDACTED) { uint64_t num_ds_redact_snaps; uint64_t *ds_redact_snaps; uint_t num_stream_redact_snaps; uint64_t *stream_redact_snaps; if (nvlist_lookup_uint64_array(drc->drc_begin_nvl, BEGINNV_REDACT_SNAPS, &stream_redact_snaps, &num_stream_redact_snaps) != 0) { dsl_dataset_rele_flags(ds, dsflags, FTAG); return (SET_ERROR(EINVAL)); } if (!dsl_dataset_get_uint64_array_feature(ds, SPA_FEATURE_REDACTED_DATASETS, &num_ds_redact_snaps, &ds_redact_snaps)) { dsl_dataset_rele_flags(ds, dsflags, FTAG); return (SET_ERROR(EINVAL)); } for (int i = 0; i < num_ds_redact_snaps; i++) { if (!redact_snaps_contains(ds_redact_snaps, num_ds_redact_snaps, stream_redact_snaps[i])) { dsl_dataset_rele_flags(ds, dsflags, FTAG); return (SET_ERROR(EINVAL)); } } } error = recv_check_large_blocks(ds, drc->drc_featureflags); if (error != 0) { dsl_dataset_rele_flags(ds, dsflags, FTAG); return (error); } dsl_dataset_rele_flags(ds, dsflags, FTAG); return (0); } static void dmu_recv_resume_begin_sync(void *arg, dmu_tx_t *tx) { dmu_recv_begin_arg_t *drba = arg; dsl_pool_t *dp = dmu_tx_pool(tx); const char *tofs = drba->drba_cookie->drc_tofs; uint64_t featureflags = drba->drba_cookie->drc_featureflags; dsl_dataset_t *ds; 
ds_hold_flags_t dsflags = DS_HOLD_FLAG_NONE; /* 6 extra bytes for /%recv */ char recvname[ZFS_MAX_DATASET_NAME_LEN + 6]; (void) snprintf(recvname, sizeof (recvname), "%s/%s", tofs, recv_clone_name); if (featureflags & DMU_BACKUP_FEATURE_RAW) { drba->drba_cookie->drc_raw = B_TRUE; } else { dsflags |= DS_HOLD_FLAG_DECRYPT; } if (dsl_dataset_own_force(dp, recvname, dsflags, dmu_recv_tag, &ds) != 0) { /* %recv does not exist; continue in tofs */ VERIFY0(dsl_dataset_own_force(dp, tofs, dsflags, dmu_recv_tag, &ds)); drba->drba_cookie->drc_newfs = B_TRUE; } ASSERT(DS_IS_INCONSISTENT(ds)); rrw_enter(&ds->ds_bp_rwlock, RW_READER, FTAG); ASSERT(!BP_IS_HOLE(dsl_dataset_get_blkptr(ds)) || drba->drba_cookie->drc_raw); rrw_exit(&ds->ds_bp_rwlock, FTAG); drba->drba_cookie->drc_ds = ds; VERIFY0(dmu_objset_from_ds(ds, &drba->drba_cookie->drc_os)); drba->drba_cookie->drc_should_save = B_TRUE; spa_history_log_internal_ds(ds, "resume receive", tx, " "); } /* * NB: callers *MUST* call dmu_recv_stream() if dmu_recv_begin() * succeeds; otherwise we will leak the holds on the datasets. 
*/
/*
 * Start a receive: initialize the caller-supplied cookie from the arguments
 * and the BEGIN record, read the BEGIN payload (an nvlist, if any) together
 * with the next record header, and run the matching check/sync task pair
 * (the resume variants when DMU_BACKUP_FEATURE_RESUMING is set).
 *
 * Returns 0 on success or an errno-style value.  When a failure happens
 * after the payload has been read, the prefetched next record and the
 * unpacked BEGIN nvlist are released before returning.
 *
 * NOTE(review): the '+' and '-' characters that begin some lines below are
 * unified-diff markers carried in this text (the hunk adds a payload size
 * cap and replaces kmem_alloc/kmem_free with vmem_alloc/vmem_free); they are
 * not C operators.
 */
int
dmu_recv_begin(char *tofs, char *tosnap, dmu_replay_record_t *drr_begin,
    boolean_t force, boolean_t heal, boolean_t resumable, nvlist_t *localprops,
    nvlist_t *hidden_args, char *origin, dmu_recv_cookie_t *drc,
    zfs_file_t *fp, offset_t *voffp)
{
	dmu_recv_begin_arg_t drba = { 0 };
	int err;

	/* start from a clean cookie, then record the caller's arguments */
	memset(drc, 0, sizeof (dmu_recv_cookie_t));
	drc->drc_drr_begin = drr_begin;
	drc->drc_drrb = &drr_begin->drr_u.drr_begin;
	drc->drc_tosnap = tosnap;
	drc->drc_tofs = tofs;
	drc->drc_force = force;
	drc->drc_heal = heal;
	drc->drc_resumable = resumable;
	drc->drc_cred = CRED();
	drc->drc_proc = curproc;
	drc->drc_clone = (origin != NULL);

	/*
	 * The magic tells us the stream's byte order.  Either way the BEGIN
	 * record is folded into the running stream checksum; a byteswapped
	 * record is checksummed first and then swapped in place.
	 */
	if (drc->drc_drrb->drr_magic == BSWAP_64(DMU_BACKUP_MAGIC)) {
		drc->drc_byteswap = B_TRUE;
		(void) fletcher_4_incremental_byteswap(drr_begin,
		    sizeof (dmu_replay_record_t), &drc->drc_cksum);
		byteswap_record(drr_begin);
	} else if (drc->drc_drrb->drr_magic == DMU_BACKUP_MAGIC) {
		(void) fletcher_4_incremental_native(drr_begin,
		    sizeof (dmu_replay_record_t), &drc->drc_cksum);
	} else {
		return (SET_ERROR(EINVAL));
	}

	drc->drc_fp = fp;
	drc->drc_voff = *voffp;
	drc->drc_featureflags =
	    DMU_GET_FEATUREFLAGS(drc->drc_drrb->drr_versioninfo);

	uint32_t payloadlen = drc->drc_drr_begin->drr_payloadlen;
	void *payload = NULL;
+
+	/*
+	 * Since OpenZFS 2.0.0, we have enforced a 64MB limit in userspace
+	 * configurable via ZFS_SENDRECV_MAX_NVLIST. We enforce 256MB as a hard
+	 * upper limit. Systems with less than 1GB of RAM will see a lower
+	 * limit from `arc_all_memory() / 4`.
+	 */
+	if (payloadlen > (MIN((1U << 28), arc_all_memory() / 4)))
+		return (E2BIG);
+
	if (payloadlen != 0)
-		payload = kmem_alloc(payloadlen, KM_SLEEP);
+		payload = vmem_alloc(payloadlen, KM_SLEEP);

	err = receive_read_payload_and_next_header(drc, payloadlen,
	    payload);
	if (err != 0) {
		/*
		 * NOTE(review): when payloadlen is 0 this frees a NULL
		 * pointer with size 0 -- confirm the allocator accepts that.
		 */
-		kmem_free(payload, payloadlen);
+		vmem_free(payload, payloadlen);
		return (err);
	}
	if (payloadlen != 0) {
		/* the raw buffer is not needed once the nvlist is unpacked */
		err = nvlist_unpack(payload, payloadlen, &drc->drc_begin_nvl,
		    KM_SLEEP);
-		kmem_free(payload, payloadlen);
+		vmem_free(payload, payloadlen);
		if (err != 0) {
			kmem_free(drc->drc_next_rrd,
			    sizeof (*drc->drc_next_rrd));
			return (err);
		}
	}

	if (drc->drc_drrb->drr_flags & DRR_FLAG_SPILL_BLOCK)
		drc->drc_spill = B_TRUE;

	drba.drba_origin = origin;
	drba.drba_cookie = drc;
	drba.drba_cred = CRED();
	drba.drba_proc = curproc;

	if (drc->drc_featureflags & DMU_BACKUP_FEATURE_RESUMING) {
		err = dsl_sync_task(tofs,
		    dmu_recv_resume_begin_check, dmu_recv_resume_begin_sync,
		    &drba, 5, ZFS_SPACE_CHECK_NORMAL);
	} else {
		/*
		 * For non-raw, non-incremental, non-resuming receives the
		 * user can specify encryption parameters on the command line
		 * with "zfs recv -o". For these receives we create a dcp and
		 * pass it to the sync task. Creating the dcp will implicitly
		 * remove the encryption params from the localprops nvlist,
		 * which avoids errors when trying to set these normally
		 * read-only properties. Any other kind of receive that
		 * attempts to set these properties will fail as a result.
		 */
		if ((DMU_GET_FEATUREFLAGS(drc->drc_drrb->drr_versioninfo) &
		    DMU_BACKUP_FEATURE_RAW) == 0 &&
		    origin == NULL && drc->drc_drrb->drr_fromguid == 0) {
			err = dsl_crypto_params_create_nvlist(DCP_CMD_NONE,
			    localprops, hidden_args, &drba.drba_dcp);
		}

		/*
		 * err is still 0 here unless creating the dcp failed: every
		 * earlier failure has already returned.
		 */
		if (err == 0) {
			err = dsl_sync_task(tofs,
			    dmu_recv_begin_check, dmu_recv_begin_sync,
			    &drba, 5, ZFS_SPACE_CHECK_NORMAL);
			dsl_crypto_params_free(drba.drba_dcp, !!err);
		}
	}

	if (err != 0) {
		kmem_free(drc->drc_next_rrd, sizeof (*drc->drc_next_rrd));
		nvlist_free(drc->drc_begin_nvl);
	}
	return (err);
}

/*
 * Holds the data needed by the corrective recv read callback
 * (corrective_read_done()); allocated in do_corrective_recv() and freed by
 * the callback.
 */
typedef struct cr_cb_data {
	uint64_t size;		/* logical size of the healed write */
	zbookmark_phys_t zb;	/* bookmark of the healed block */
	spa_t *spa;		/* pool whose error log is updated */
} cr_cb_data_t;

/*
 * Done callback for the verification read issued after a corrective
 * rewrite.  If the read succeeded the block's bookmark is removed from the
 * pool's error log.  The callback data and the read buffer are freed in
 * every case.
 */
static void
corrective_read_done(zio_t *zio)
{
	cr_cb_data_t *data = zio->io_private;
	/* Corruption corrected; update error log if needed */
	if (zio->io_error == 0)
		spa_remove_error(data->spa, &data->zb);
	kmem_free(data, sizeof (cr_cb_data_t));
	abd_free(zio->io_abd);
}

/*
 * zio_rewrite the data pointed to by bp with the data from the rrd's abd.
*/ static int do_corrective_recv(struct receive_writer_arg *rwa, struct drr_write *drrw, struct receive_record_arg *rrd, blkptr_t *bp) { int err; zio_t *io; zbookmark_phys_t zb; dnode_t *dn; abd_t *abd = rrd->abd; zio_cksum_t bp_cksum = bp->blk_cksum; zio_flag_t flags = ZIO_FLAG_SPECULATIVE | ZIO_FLAG_DONT_CACHE | ZIO_FLAG_DONT_RETRY | ZIO_FLAG_CANFAIL; if (rwa->raw) flags |= ZIO_FLAG_RAW; err = dnode_hold(rwa->os, drrw->drr_object, FTAG, &dn); if (err != 0) return (err); SET_BOOKMARK(&zb, dmu_objset_id(rwa->os), drrw->drr_object, 0, dbuf_whichblock(dn, 0, drrw->drr_offset)); dnode_rele(dn, FTAG); if (!rwa->raw && DRR_WRITE_COMPRESSED(drrw)) { /* Decompress the stream data */ abd_t *dabd = abd_alloc_linear( drrw->drr_logical_size, B_FALSE); err = zio_decompress_data(drrw->drr_compressiontype, abd, abd_to_buf(dabd), abd_get_size(abd), abd_get_size(dabd), NULL); if (err != 0) { abd_free(dabd); return (err); } /* Swap in the newly decompressed data into the abd */ abd_free(abd); abd = dabd; } if (!rwa->raw && BP_GET_COMPRESS(bp) != ZIO_COMPRESS_OFF) { /* Recompress the data */ abd_t *cabd = abd_alloc_linear(BP_GET_PSIZE(bp), B_FALSE); uint64_t csize = zio_compress_data(BP_GET_COMPRESS(bp), abd, abd_to_buf(cabd), abd_get_size(abd), rwa->os->os_complevel); abd_zero_off(cabd, csize, BP_GET_PSIZE(bp) - csize); /* Swap in newly compressed data into the abd */ abd_free(abd); abd = cabd; flags |= ZIO_FLAG_RAW_COMPRESS; } /* * The stream is not encrypted but the data on-disk is. * We need to re-encrypt the buf using the same * encryption type, salt, iv, and mac that was used to encrypt * the block previosly. 
*/ if (!rwa->raw && BP_USES_CRYPT(bp)) { dsl_dataset_t *ds; dsl_crypto_key_t *dck = NULL; uint8_t salt[ZIO_DATA_SALT_LEN]; uint8_t iv[ZIO_DATA_IV_LEN]; uint8_t mac[ZIO_DATA_MAC_LEN]; boolean_t no_crypt = B_FALSE; dsl_pool_t *dp = dmu_objset_pool(rwa->os); abd_t *eabd = abd_alloc_linear(BP_GET_PSIZE(bp), B_FALSE); zio_crypt_decode_params_bp(bp, salt, iv); zio_crypt_decode_mac_bp(bp, mac); dsl_pool_config_enter(dp, FTAG); err = dsl_dataset_hold_flags(dp, rwa->tofs, DS_HOLD_FLAG_DECRYPT, FTAG, &ds); if (err != 0) { dsl_pool_config_exit(dp, FTAG); abd_free(eabd); return (SET_ERROR(EACCES)); } /* Look up the key from the spa's keystore */ err = spa_keystore_lookup_key(rwa->os->os_spa, zb.zb_objset, FTAG, &dck); if (err != 0) { dsl_dataset_rele_flags(ds, DS_HOLD_FLAG_DECRYPT, FTAG); dsl_pool_config_exit(dp, FTAG); abd_free(eabd); return (SET_ERROR(EACCES)); } err = zio_do_crypt_abd(B_TRUE, &dck->dck_key, BP_GET_TYPE(bp), BP_SHOULD_BYTESWAP(bp), salt, iv, mac, abd_get_size(abd), abd, eabd, &no_crypt); spa_keystore_dsl_key_rele(rwa->os->os_spa, dck, FTAG); dsl_dataset_rele_flags(ds, DS_HOLD_FLAG_DECRYPT, FTAG); dsl_pool_config_exit(dp, FTAG); ASSERT0(no_crypt); if (err != 0) { abd_free(eabd); return (err); } /* Swap in the newly encrypted data into the abd */ abd_free(abd); abd = eabd; /* * We want to prevent zio_rewrite() from trying to * encrypt the data again */ flags |= ZIO_FLAG_RAW_ENCRYPT; } rrd->abd = abd; io = zio_rewrite(NULL, rwa->os->os_spa, bp->blk_birth, bp, abd, BP_GET_PSIZE(bp), NULL, NULL, ZIO_PRIORITY_SYNC_WRITE, flags, &zb); ASSERT(abd_get_size(abd) == BP_GET_LSIZE(bp) || abd_get_size(abd) == BP_GET_PSIZE(bp)); /* compute new bp checksum value and make sure it matches the old one */ zio_checksum_compute(io, BP_GET_CHECKSUM(bp), abd, abd_get_size(abd)); if (!ZIO_CHECKSUM_EQUAL(bp_cksum, io->io_bp->blk_cksum)) { zio_destroy(io); if (zfs_recv_best_effort_corrective != 0) return (0); return (SET_ERROR(ECKSUM)); } /* Correct the corruption in place */ err = 
zio_wait(io); if (err == 0) { cr_cb_data_t *cb_data = kmem_alloc(sizeof (cr_cb_data_t), KM_SLEEP); cb_data->spa = rwa->os->os_spa; cb_data->size = drrw->drr_logical_size; cb_data->zb = zb; /* Test if healing worked by re-reading the bp */ err = zio_wait(zio_read(rwa->heal_pio, rwa->os->os_spa, bp, abd_alloc_for_io(drrw->drr_logical_size, B_FALSE), drrw->drr_logical_size, corrective_read_done, cb_data, ZIO_PRIORITY_ASYNC_READ, flags, NULL)); } if (err != 0 && zfs_recv_best_effort_corrective != 0) err = 0; return (err); } static int receive_read(dmu_recv_cookie_t *drc, int len, void *buf) { int done = 0; /* * The code doesn't rely on this (lengths being multiples of 8). See * comment in dump_bytes. */ ASSERT(len % 8 == 0 || (drc->drc_featureflags & DMU_BACKUP_FEATURE_RAW) != 0); while (done < len) { ssize_t resid = len - done; zfs_file_t *fp = drc->drc_fp; int err = zfs_file_read(fp, (char *)buf + done, len - done, &resid); if (err == 0 && resid == len - done) { /* * Note: ECKSUM or ZFS_ERR_STREAM_TRUNCATED indicates * that the receive was interrupted and can * potentially be resumed. */ err = SET_ERROR(ZFS_ERR_STREAM_TRUNCATED); } drc->drc_voff += len - done - resid; done = len - resid; if (err != 0) return (err); } drc->drc_bytes_read += len; ASSERT3U(done, ==, len); return (0); } static inline uint8_t deduce_nblkptr(dmu_object_type_t bonus_type, uint64_t bonus_size) { if (bonus_type == DMU_OT_SA) { return (1); } else { return (1 + ((DN_OLD_MAX_BONUSLEN - MIN(DN_OLD_MAX_BONUSLEN, bonus_size)) >> SPA_BLKPTRSHIFT)); } } static void save_resume_state(struct receive_writer_arg *rwa, uint64_t object, uint64_t offset, dmu_tx_t *tx) { int txgoff = dmu_tx_get_txg(tx) & TXG_MASK; if (!rwa->resumable) return; /* * We use ds_resume_bytes[] != 0 to indicate that we need to * update this on disk, so it must not be 0. */ ASSERT(rwa->bytes_read != 0); /* * We only resume from write records, which have a valid * (non-meta-dnode) object number. 
*/ ASSERT(object != 0); /* * For resuming to work correctly, we must receive records in order, * sorted by object,offset. This is checked by the callers, but * assert it here for good measure. */ ASSERT3U(object, >=, rwa->os->os_dsl_dataset->ds_resume_object[txgoff]); ASSERT(object != rwa->os->os_dsl_dataset->ds_resume_object[txgoff] || offset >= rwa->os->os_dsl_dataset->ds_resume_offset[txgoff]); ASSERT3U(rwa->bytes_read, >=, rwa->os->os_dsl_dataset->ds_resume_bytes[txgoff]); rwa->os->os_dsl_dataset->ds_resume_object[txgoff] = object; rwa->os->os_dsl_dataset->ds_resume_offset[txgoff] = offset; rwa->os->os_dsl_dataset->ds_resume_bytes[txgoff] = rwa->bytes_read; } static int receive_object_is_same_generation(objset_t *os, uint64_t object, dmu_object_type_t old_bonus_type, dmu_object_type_t new_bonus_type, const void *new_bonus, boolean_t *samegenp) { zfs_file_info_t zoi; int err; dmu_buf_t *old_bonus_dbuf; err = dmu_bonus_hold(os, object, FTAG, &old_bonus_dbuf); if (err != 0) return (err); err = dmu_get_file_info(os, old_bonus_type, old_bonus_dbuf->db_data, &zoi); dmu_buf_rele(old_bonus_dbuf, FTAG); if (err != 0) return (err); uint64_t old_gen = zoi.zfi_generation; err = dmu_get_file_info(os, new_bonus_type, new_bonus, &zoi); if (err != 0) return (err); uint64_t new_gen = zoi.zfi_generation; *samegenp = (old_gen == new_gen); return (0); } static int receive_handle_existing_object(const struct receive_writer_arg *rwa, const struct drr_object *drro, const dmu_object_info_t *doi, const void *bonus_data, uint64_t *object_to_hold, uint32_t *new_blksz) { uint32_t indblksz = drro->drr_indblkshift ? 1ULL << drro->drr_indblkshift : 0; int nblkptr = deduce_nblkptr(drro->drr_bonustype, drro->drr_bonuslen); uint8_t dn_slots = drro->drr_dn_slots != 0 ? 
drro->drr_dn_slots : DNODE_MIN_SLOTS; boolean_t do_free_range = B_FALSE; int err; *object_to_hold = drro->drr_object; /* nblkptr should be bounded by the bonus size and type */ if (rwa->raw && nblkptr != drro->drr_nblkptr) return (SET_ERROR(EINVAL)); /* * After the previous send stream, the sending system may * have freed this object, and then happened to re-allocate * this object number in a later txg. In this case, we are * receiving a different logical file, and the block size may * appear to be different. i.e. we may have a different * block size for this object than what the send stream says. * In this case we need to remove the object's contents, * so that its structure can be changed and then its contents * entirely replaced by subsequent WRITE records. * * If this is a -L (--large-block) incremental stream, and * the previous stream was not -L, the block size may appear * to increase. i.e. we may have a smaller block size for * this object than what the send stream says. In this case * we need to keep the object's contents and block size * intact, so that we don't lose parts of the object's * contents that are not changed by this incremental send * stream. * * We can distinguish between the two above cases by using * the ZPL's generation number (see * receive_object_is_same_generation()). However, we only * want to rely on the generation number when absolutely * necessary, because with raw receives, the generation is * encrypted. We also want to minimize dependence on the * ZPL, so that other types of datasets can also be received * (e.g. ZVOLs, although note that ZVOLS currently do not * reallocate their objects or change their structure). * Therefore, we check a number of different cases where we * know it is safe to discard the object's contents, before * using the ZPL's generation number to make the above * distinction. 
*/ if (drro->drr_blksz != doi->doi_data_block_size) { if (rwa->raw) { /* * RAW streams always have large blocks, so * we are sure that the data is not needed * due to changing --large-block to be on. * Which is fortunate since the bonus buffer * (which contains the ZPL generation) is * encrypted, and the key might not be * loaded. */ do_free_range = B_TRUE; } else if (rwa->full) { /* * This is a full send stream, so it always * replaces what we have. Even if the * generation numbers happen to match, this * can not actually be the same logical file. * This is relevant when receiving a full * send as a clone. */ do_free_range = B_TRUE; } else if (drro->drr_type != DMU_OT_PLAIN_FILE_CONTENTS || doi->doi_type != DMU_OT_PLAIN_FILE_CONTENTS) { /* * PLAIN_FILE_CONTENTS are the only type of * objects that have ever been stored with * large blocks, so we don't need the special * logic below. ZAP blocks can shrink (when * there's only one block), so we don't want * to hit the error below about block size * only increasing. */ do_free_range = B_TRUE; } else if (doi->doi_max_offset <= doi->doi_data_block_size) { /* * There is only one block. We can free it, * because its contents will be replaced by a * WRITE record. This can not be the no-L -> * -L case, because the no-L case would have * resulted in multiple blocks. If we * supported -L -> no-L, it would not be safe * to free the file's contents. Fortunately, * that is not allowed (see * recv_check_large_blocks()). */ do_free_range = B_TRUE; } else { boolean_t is_same_gen; err = receive_object_is_same_generation(rwa->os, drro->drr_object, doi->doi_bonus_type, drro->drr_bonustype, bonus_data, &is_same_gen); if (err != 0) return (SET_ERROR(EINVAL)); if (is_same_gen) { /* * This is the same logical file, and * the block size must be increasing. * It could only decrease if * --large-block was changed to be * off, which is checked in * recv_check_large_blocks(). 
*/ if (drro->drr_blksz <= doi->doi_data_block_size) return (SET_ERROR(EINVAL)); /* * We keep the existing blocksize and * contents. */ *new_blksz = doi->doi_data_block_size; } else { do_free_range = B_TRUE; } } } /* nblkptr can only decrease if the object was reallocated */ if (nblkptr < doi->doi_nblkptr) do_free_range = B_TRUE; /* number of slots can only change on reallocation */ if (dn_slots != doi->doi_dnodesize >> DNODE_SHIFT) do_free_range = B_TRUE; /* * For raw sends we also check a few other fields to * ensure we are preserving the objset structure exactly * as it was on the receive side: * - A changed indirect block size * - A smaller nlevels */ if (rwa->raw) { if (indblksz != doi->doi_metadata_block_size) do_free_range = B_TRUE; if (drro->drr_nlevels < doi->doi_indirection) do_free_range = B_TRUE; } if (do_free_range) { err = dmu_free_long_range(rwa->os, drro->drr_object, 0, DMU_OBJECT_END); if (err != 0) return (SET_ERROR(EINVAL)); } /* * The dmu does not currently support decreasing nlevels * or changing the number of dnode slots on an object. For * non-raw sends, this does not matter and the new object * can just use the previous one's nlevels. For raw sends, * however, the structure of the received dnode (including * nlevels and dnode slots) must match that of the send * side. Therefore, instead of using dmu_object_reclaim(), * we must free the object completely and call * dmu_object_claim_dnsize() instead. */ if ((rwa->raw && drro->drr_nlevels < doi->doi_indirection) || dn_slots != doi->doi_dnodesize >> DNODE_SHIFT) { err = dmu_free_long_object(rwa->os, drro->drr_object); if (err != 0) return (SET_ERROR(EINVAL)); txg_wait_synced(dmu_objset_pool(rwa->os), 0); *object_to_hold = DMU_NEW_OBJECT; } /* * For raw receives, free everything beyond the new incoming * maxblkid. Normally this would be done with a DRR_FREE * record that would come after this DRR_OBJECT record is * processed. 
However, for raw receives we manually set the * maxblkid from the drr_maxblkid and so we must first free * everything above that blkid to ensure the DMU is always * consistent with itself. We will never free the first block * of the object here because a maxblkid of 0 could indicate * an object with a single block or one with no blocks. This * free may be skipped when dmu_free_long_range() was called * above since it covers the entire object's contents. */ if (rwa->raw && *object_to_hold != DMU_NEW_OBJECT && !do_free_range) { err = dmu_free_long_range(rwa->os, drro->drr_object, (drro->drr_maxblkid + 1) * doi->doi_data_block_size, DMU_OBJECT_END); if (err != 0) return (SET_ERROR(EINVAL)); } return (0); } noinline static int receive_object(struct receive_writer_arg *rwa, struct drr_object *drro, void *data) { dmu_object_info_t doi; dmu_tx_t *tx; int err; uint32_t new_blksz = drro->drr_blksz; uint8_t dn_slots = drro->drr_dn_slots != 0 ? drro->drr_dn_slots : DNODE_MIN_SLOTS; if (drro->drr_type == DMU_OT_NONE || !DMU_OT_IS_VALID(drro->drr_type) || !DMU_OT_IS_VALID(drro->drr_bonustype) || drro->drr_checksumtype >= ZIO_CHECKSUM_FUNCTIONS || drro->drr_compress >= ZIO_COMPRESS_FUNCTIONS || P2PHASE(drro->drr_blksz, SPA_MINBLOCKSIZE) || drro->drr_blksz < SPA_MINBLOCKSIZE || drro->drr_blksz > spa_maxblocksize(dmu_objset_spa(rwa->os)) || drro->drr_bonuslen > DN_BONUS_SIZE(spa_maxdnodesize(dmu_objset_spa(rwa->os))) || dn_slots > (spa_maxdnodesize(dmu_objset_spa(rwa->os)) >> DNODE_SHIFT)) { return (SET_ERROR(EINVAL)); } if (rwa->raw) { /* * We should have received a DRR_OBJECT_RANGE record * containing this block and stored it in rwa. 
*/ if (drro->drr_object < rwa->or_firstobj || drro->drr_object >= rwa->or_firstobj + rwa->or_numslots || drro->drr_raw_bonuslen < drro->drr_bonuslen || drro->drr_indblkshift > SPA_MAXBLOCKSHIFT || drro->drr_nlevels > DN_MAX_LEVELS || drro->drr_nblkptr > DN_MAX_NBLKPTR || DN_SLOTS_TO_BONUSLEN(dn_slots) < drro->drr_raw_bonuslen) return (SET_ERROR(EINVAL)); } else { /* * The DRR_OBJECT_SPILL flag is valid when the DRR_BEGIN * record indicates this by setting DRR_FLAG_SPILL_BLOCK. */ if (((drro->drr_flags & ~(DRR_OBJECT_SPILL))) || (!rwa->spill && DRR_OBJECT_HAS_SPILL(drro->drr_flags))) { return (SET_ERROR(EINVAL)); } if (drro->drr_raw_bonuslen != 0 || drro->drr_nblkptr != 0 || drro->drr_indblkshift != 0 || drro->drr_nlevels != 0) { return (SET_ERROR(EINVAL)); } } err = dmu_object_info(rwa->os, drro->drr_object, &doi); if (err != 0 && err != ENOENT && err != EEXIST) return (SET_ERROR(EINVAL)); if (drro->drr_object > rwa->max_object) rwa->max_object = drro->drr_object; /* * If we are losing blkptrs or changing the block size this must * be a new file instance. We must clear out the previous file * contents before we can change this type of metadata in the dnode. * Raw receives will also check that the indirect structure of the * dnode hasn't changed. */ uint64_t object_to_hold; if (err == 0) { err = receive_handle_existing_object(rwa, drro, &doi, data, &object_to_hold, &new_blksz); if (err != 0) return (err); } else if (err == EEXIST) { /* * The object requested is currently an interior slot of a * multi-slot dnode. This will be resolved when the next txg * is synced out, since the send stream will have told us * to free this slot when we freed the associated dnode * earlier in the stream. 
*/ txg_wait_synced(dmu_objset_pool(rwa->os), 0); if (dmu_object_info(rwa->os, drro->drr_object, NULL) != ENOENT) return (SET_ERROR(EINVAL)); /* object was freed and we are about to allocate a new one */ object_to_hold = DMU_NEW_OBJECT; } else { /* object is free and we are about to allocate a new one */ object_to_hold = DMU_NEW_OBJECT; } /* * If this is a multi-slot dnode there is a chance that this * object will expand into a slot that is already used by * another object from the previous snapshot. We must free * these objects before we attempt to allocate the new dnode. */ if (dn_slots > 1) { boolean_t need_sync = B_FALSE; for (uint64_t slot = drro->drr_object + 1; slot < drro->drr_object + dn_slots; slot++) { dmu_object_info_t slot_doi; err = dmu_object_info(rwa->os, slot, &slot_doi); if (err == ENOENT || err == EEXIST) continue; else if (err != 0) return (err); err = dmu_free_long_object(rwa->os, slot); if (err != 0) return (err); need_sync = B_TRUE; } if (need_sync) txg_wait_synced(dmu_objset_pool(rwa->os), 0); } tx = dmu_tx_create(rwa->os); dmu_tx_hold_bonus(tx, object_to_hold); dmu_tx_hold_write(tx, object_to_hold, 0, 0); err = dmu_tx_assign(tx, TXG_WAIT); if (err != 0) { dmu_tx_abort(tx); return (err); } if (object_to_hold == DMU_NEW_OBJECT) { /* Currently free, wants to be allocated */ err = dmu_object_claim_dnsize(rwa->os, drro->drr_object, drro->drr_type, new_blksz, drro->drr_bonustype, drro->drr_bonuslen, dn_slots << DNODE_SHIFT, tx); } else if (drro->drr_type != doi.doi_type || new_blksz != doi.doi_data_block_size || drro->drr_bonustype != doi.doi_bonus_type || drro->drr_bonuslen != doi.doi_bonus_size) { /* Currently allocated, but with different properties */ err = dmu_object_reclaim_dnsize(rwa->os, drro->drr_object, drro->drr_type, new_blksz, drro->drr_bonustype, drro->drr_bonuslen, dn_slots << DNODE_SHIFT, rwa->spill ? 
DRR_OBJECT_HAS_SPILL(drro->drr_flags) : B_FALSE, tx); } else if (rwa->spill && !DRR_OBJECT_HAS_SPILL(drro->drr_flags)) { /* * Currently allocated, the existing version of this object * may reference a spill block that is no longer allocated * at the source and needs to be freed. */ err = dmu_object_rm_spill(rwa->os, drro->drr_object, tx); } if (err != 0) { dmu_tx_commit(tx); return (SET_ERROR(EINVAL)); } if (rwa->or_crypt_params_present) { /* * Set the crypt params for the buffer associated with this * range of dnodes. This causes the blkptr_t to have the * same crypt params (byteorder, salt, iv, mac) as on the * sending side. * * Since we are committing this tx now, it is possible for * the dnode block to end up on-disk with the incorrect MAC, * if subsequent objects in this block are received in a * different txg. However, since the dataset is marked as * inconsistent, no code paths will do a non-raw read (or * decrypt the block / verify the MAC). The receive code and * scrub code can safely do raw reads and verify the * checksum. They don't need to verify the MAC. */ dmu_buf_t *db = NULL; uint64_t offset = rwa->or_firstobj * DNODE_MIN_SIZE; err = dmu_buf_hold_by_dnode(DMU_META_DNODE(rwa->os), offset, FTAG, &db, DMU_READ_PREFETCH | DMU_READ_NO_DECRYPT); if (err != 0) { dmu_tx_commit(tx); return (SET_ERROR(EINVAL)); } dmu_buf_set_crypt_params(db, rwa->or_byteorder, rwa->or_salt, rwa->or_iv, rwa->or_mac, tx); dmu_buf_rele(db, FTAG); rwa->or_crypt_params_present = B_FALSE; } dmu_object_set_checksum(rwa->os, drro->drr_object, drro->drr_checksumtype, tx); dmu_object_set_compress(rwa->os, drro->drr_object, drro->drr_compress, tx); /* handle more restrictive dnode structuring for raw recvs */ if (rwa->raw) { /* * Set the indirect block size, block shift, nlevels. * This will not fail because we ensured all of the * blocks were freed earlier if this is a new object. * For non-new objects block size and indirect block * shift cannot change and nlevels can only increase. 
*/ ASSERT3U(new_blksz, ==, drro->drr_blksz); VERIFY0(dmu_object_set_blocksize(rwa->os, drro->drr_object, drro->drr_blksz, drro->drr_indblkshift, tx)); VERIFY0(dmu_object_set_nlevels(rwa->os, drro->drr_object, drro->drr_nlevels, tx)); /* * Set the maxblkid. This will always succeed because * we freed all blocks beyond the new maxblkid above. */ VERIFY0(dmu_object_set_maxblkid(rwa->os, drro->drr_object, drro->drr_maxblkid, tx)); } if (data != NULL) { dmu_buf_t *db; dnode_t *dn; uint32_t flags = DMU_READ_NO_PREFETCH; if (rwa->raw) flags |= DMU_READ_NO_DECRYPT; VERIFY0(dnode_hold(rwa->os, drro->drr_object, FTAG, &dn)); VERIFY0(dmu_bonus_hold_by_dnode(dn, FTAG, &db, flags)); dmu_buf_will_dirty(db, tx); ASSERT3U(db->db_size, >=, drro->drr_bonuslen); memcpy(db->db_data, data, DRR_OBJECT_PAYLOAD_SIZE(drro)); /* * Raw bonus buffers have their byteorder determined by the * DRR_OBJECT_RANGE record. */ if (rwa->byteswap && !rwa->raw) { dmu_object_byteswap_t byteswap = DMU_OT_BYTESWAP(drro->drr_bonustype); dmu_ot_byteswap[byteswap].ob_func(db->db_data, DRR_OBJECT_PAYLOAD_SIZE(drro)); } dmu_buf_rele(db, FTAG); dnode_rele(dn, FTAG); } dmu_tx_commit(tx); return (0); } noinline static int receive_freeobjects(struct receive_writer_arg *rwa, struct drr_freeobjects *drrfo) { uint64_t obj; int next_err = 0; if (drrfo->drr_firstobj + drrfo->drr_numobjs < drrfo->drr_firstobj) return (SET_ERROR(EINVAL)); for (obj = drrfo->drr_firstobj == 0 ? 1 : drrfo->drr_firstobj; obj < drrfo->drr_firstobj + drrfo->drr_numobjs && obj < DN_MAX_OBJECT && next_err == 0; next_err = dmu_object_next(rwa->os, &obj, FALSE, 0)) { dmu_object_info_t doi; int err; err = dmu_object_info(rwa->os, obj, &doi); if (err == ENOENT) continue; else if (err != 0) return (err); err = dmu_free_long_object(rwa->os, obj); if (err != 0) return (err); } if (next_err != ESRCH) return (next_err); return (0); } /* * Note: if this fails, the caller will clean up any records left on the * rwa->write_batch list. 
*/
/*
 * Write out every record currently on rwa->write_batch inside a single
 * transaction held on the dnode of rwa->last_object.  Records are taken
 * from the head of the list; each one that is written successfully has its
 * resume state saved and is removed from the list and freed.  On the first
 * failure the loop stops with the failing record still on the list.
 *
 * Returns EINVAL if the dnode cannot be held, the dmu_tx_assign() error if
 * the transaction cannot be assigned, otherwise the error of the last write
 * attempted (0 when the whole batch was written).
 */
static int
flush_write_batch_impl(struct receive_writer_arg *rwa)
{
	dnode_t *dn;
	int err;

	if (dnode_hold(rwa->os, rwa->last_object, FTAG, &dn) != 0)
		return (SET_ERROR(EINVAL));

	/* NOTE(review): assumes the batch is non-empty; the caller checks. */
	struct receive_record_arg *last_rrd = list_tail(&rwa->write_batch);
	struct drr_write *last_drrw = &last_rrd->header.drr_u.drr_write;

	struct receive_record_arg *first_rrd = list_head(&rwa->write_batch);
	struct drr_write *first_drrw = &first_rrd->header.drr_u.drr_write;

	ASSERT3U(rwa->last_object, ==, last_drrw->drr_object);
	ASSERT3U(rwa->last_offset, ==, last_drrw->drr_offset);

	/*
	 * One write hold spanning from the first record's offset to the end
	 * of the last record's logical extent.
	 */
	dmu_tx_t *tx = dmu_tx_create(rwa->os);
	dmu_tx_hold_write_by_dnode(tx, dn, first_drrw->drr_offset,
	    last_drrw->drr_offset - first_drrw->drr_offset +
	    last_drrw->drr_logical_size);
	err = dmu_tx_assign(tx, TXG_WAIT);
	if (err != 0) {
		dmu_tx_abort(tx);
		dnode_rele(dn, FTAG);
		return (err);
	}

	struct receive_record_arg *rrd;
	while ((rrd = list_head(&rwa->write_batch)) != NULL) {
		struct drr_write *drrw = &rrd->header.drr_u.drr_write;
		abd_t *abd = rrd->abd;

		ASSERT3U(drrw->drr_object, ==, rwa->last_object);

		if (drrw->drr_logical_size != dn->dn_datablksz) {
			/*
			 * The WRITE record is larger than the object's block
			 * size.  We must be receiving an incremental
			 * large-block stream into a dataset that previously did
			 * a non-large-block receive.  Lightweight writes must
			 * be exactly one block, so we need to decompress the
			 * data (if compressed) and do a normal dmu_write().
			 */
			ASSERT3U(drrw->drr_logical_size, >, dn->dn_datablksz);
			if (DRR_WRITE_COMPRESSED(drrw)) {
				abd_t *decomp_abd =
				    abd_alloc_linear(drrw->drr_logical_size,
				    B_FALSE);

				err = zio_decompress_data(
				    drrw->drr_compressiontype,
				    abd, abd_to_buf(decomp_abd),
				    abd_get_size(abd),
				    abd_get_size(decomp_abd), NULL);

				if (err == 0) {
					dmu_write_by_dnode(dn,
					    drrw->drr_offset,
					    drrw->drr_logical_size,
					    abd_to_buf(decomp_abd), tx);
				}
				abd_free(decomp_abd);
			} else {
				dmu_write_by_dnode(dn,
				    drrw->drr_offset,
				    drrw->drr_logical_size,
				    abd_to_buf(abd), tx);
			}
			/*
			 * On this path the record's abd is freed here only on
			 * success; on failure it stays attached to the rrd.
			 */
			if (err == 0)
				abd_free(abd);
		} else {
			zio_prop_t zp;
			dmu_write_policy(rwa->os, dn, 0, 0, &zp);

			zio_flag_t zio_flags = 0;

			if (rwa->raw) {
				/*
				 * Raw receive: carry the record's compression
				 * type, byteorder and salt/IV/MAC over into
				 * the write properties.
				 */
				zp.zp_encrypt = B_TRUE;
				zp.zp_compress = drrw->drr_compressiontype;
				zp.zp_byteorder = ZFS_HOST_BYTEORDER ^
				    !!DRR_IS_RAW_BYTESWAPPED(drrw->drr_flags) ^
				    rwa->byteswap;
				memcpy(zp.zp_salt, drrw->drr_salt,
				    ZIO_DATA_SALT_LEN);
				memcpy(zp.zp_iv, drrw->drr_iv,
				    ZIO_DATA_IV_LEN);
				memcpy(zp.zp_mac, drrw->drr_mac,
				    ZIO_DATA_MAC_LEN);
				if (DMU_OT_IS_ENCRYPTED(zp.zp_type)) {
					zp.zp_nopwrite = B_FALSE;
					zp.zp_copies = MIN(zp.zp_copies,
					    SPA_DVAS_PER_BP - 1);
				}
				zio_flags |= ZIO_FLAG_RAW;
			} else if (DRR_WRITE_COMPRESSED(drrw)) {
				ASSERT3U(drrw->drr_compressed_size, >, 0);
				ASSERT3U(drrw->drr_logical_size, >=,
				    drrw->drr_compressed_size);
				zp.zp_compress = drrw->drr_compressiontype;
				zio_flags |= ZIO_FLAG_RAW_COMPRESS;
			} else if (rwa->byteswap) {
				/*
				 * Note: compressed blocks never need to be
				 * byteswapped, because WRITE records for
				 * metadata blocks are never compressed. The
				 * exception is raw streams, which are written
				 * in the original byteorder, and the byteorder
				 * bit is preserved in the BP by setting
				 * zp_byteorder above.
				 */
				dmu_object_byteswap_t byteswap =
				    DMU_OT_BYTESWAP(drrw->drr_type);
				dmu_ot_byteswap[byteswap].ob_func(
				    abd_to_buf(abd),
				    DRR_WRITE_PAYLOAD_SIZE(drrw));
			}

			/*
			 * Since this data can't be read until the receive
			 * completes, we can do a "lightweight" write for
			 * improved performance.
			 */
			err = dmu_lightweight_write_by_dnode(dn,
			    drrw->drr_offset, abd, &zp, zio_flags, tx);
		}

		if (err != 0) {
			/*
			 * This rrd is left on the list, so the caller will
			 * free it (and the abd).
			 */
			break;
		}

		/*
		 * Note: If the receive fails, we want the resume stream to
		 * start with the same record that we last successfully
		 * received (as opposed to the next record), so that we can
		 * verify that we are resuming from the correct location.
		 */
		save_resume_state(rwa, drrw->drr_object, drrw->drr_offset, tx);

		list_remove(&rwa->write_batch, rrd);
		kmem_free(rrd, sizeof (*rrd));
	}

	/* The transaction is committed whether or not the loop hit an error. */
	dmu_tx_commit(tx);
	dnode_rele(dn, FTAG);
	return (err);
}

/*
 * Flush rwa->write_batch.  Does nothing (returns 0) when the batch is empty.
 * If rwa->err is already set, nothing is written and that error is returned.
 * On any error, every record still on the list is freed together with its
 * abd, so the list is always empty when this returns.
 */
noinline static int
flush_write_batch(struct receive_writer_arg *rwa)
{
	if (list_is_empty(&rwa->write_batch))
		return (0);
	int err = rwa->err;
	if (err == 0)
		err = flush_write_batch_impl(rwa);
	if (err != 0) {
		struct receive_record_arg *rrd;
		while ((rrd = list_remove_head(&rwa->write_batch)) != NULL) {
			abd_free(rrd->abd);
			kmem_free(rrd, sizeof (*rrd));
		}
	}
	ASSERT(list_is_empty(&rwa->write_batch));
	return (err);
}

noinline static int
receive_process_write_record(struct receive_writer_arg *rwa,
    struct receive_record_arg *rrd)
{
	int err = 0;

	ASSERT3U(rrd->header.drr_type, ==, DRR_WRITE);
	struct drr_write *drrw = &rrd->header.drr_u.drr_write;

	if (drrw->drr_offset + drrw->drr_logical_size < drrw->drr_offset ||
	    !DMU_OT_IS_VALID(drrw->drr_type))
		return (SET_ERROR(EINVAL));

	if (rwa->heal) {
		blkptr_t *bp;
		dmu_buf_t *dbp;
		dnode_t *dn;
		int flags = DB_RF_CANFAIL;

		if (rwa->raw)
			flags |= DB_RF_NO_DECRYPT;

		if (rwa->byteswap) {
			dmu_object_byteswap_t byteswap =
			    DMU_OT_BYTESWAP(drrw->drr_type);
			dmu_ot_byteswap[byteswap].ob_func(abd_to_buf(rrd->abd),
			    DRR_WRITE_PAYLOAD_SIZE(drrw));
		}

		err = dmu_buf_hold_noread(rwa->os, drrw->drr_object,
		    drrw->drr_offset, FTAG, &dbp);
		if (err != 0)
			return (err);

		/* Try to read the object to see if it needs healing */
		err = dbuf_read((dmu_buf_impl_t *)dbp, NULL, flags);
		/*
		 * We only try to heal when dbuf_read() returns a ECKSUMs.
* Other errors (even EIO) get returned to caller. * EIO indicates that the device is not present/accessible, * so writing to it will likely fail. * If the block is healthy, we don't want to overwrite it * unnecessarily. */ if (err != ECKSUM) { dmu_buf_rele(dbp, FTAG); return (err); } dn = dmu_buf_dnode_enter(dbp); /* Make sure the on-disk block and recv record sizes match */ if (drrw->drr_logical_size != dn->dn_datablkszsec << SPA_MINBLOCKSHIFT) { err = ENOTSUP; dmu_buf_dnode_exit(dbp); dmu_buf_rele(dbp, FTAG); return (err); } /* Get the block pointer for the corrupted block */ bp = dmu_buf_get_blkptr(dbp); err = do_corrective_recv(rwa, drrw, rrd, bp); dmu_buf_dnode_exit(dbp); dmu_buf_rele(dbp, FTAG); return (err); } /* * For resuming to work, records must be in increasing order * by (object, offset). */ if (drrw->drr_object < rwa->last_object || (drrw->drr_object == rwa->last_object && drrw->drr_offset < rwa->last_offset)) { return (SET_ERROR(EINVAL)); } struct receive_record_arg *first_rrd = list_head(&rwa->write_batch); struct drr_write *first_drrw = &first_rrd->header.drr_u.drr_write; uint64_t batch_size = MIN(zfs_recv_write_batch_size, DMU_MAX_ACCESS / 2); if (first_rrd != NULL && (drrw->drr_object != first_drrw->drr_object || drrw->drr_offset >= first_drrw->drr_offset + batch_size)) { err = flush_write_batch(rwa); if (err != 0) return (err); } rwa->last_object = drrw->drr_object; rwa->last_offset = drrw->drr_offset; if (rwa->last_object > rwa->max_object) rwa->max_object = rwa->last_object; list_insert_tail(&rwa->write_batch, rrd); /* * Return EAGAIN to indicate that we will use this rrd again, * so the caller should not free it */ return (EAGAIN); } static int receive_write_embedded(struct receive_writer_arg *rwa, struct drr_write_embedded *drrwe, void *data) { dmu_tx_t *tx; int err; if (drrwe->drr_offset + drrwe->drr_length < drrwe->drr_offset) return (SET_ERROR(EINVAL)); if (drrwe->drr_psize > BPE_PAYLOAD_SIZE) return (SET_ERROR(EINVAL)); if 
(drrwe->drr_etype >= NUM_BP_EMBEDDED_TYPES) return (SET_ERROR(EINVAL)); if (drrwe->drr_compression >= ZIO_COMPRESS_FUNCTIONS) return (SET_ERROR(EINVAL)); if (rwa->raw) return (SET_ERROR(EINVAL)); if (drrwe->drr_object > rwa->max_object) rwa->max_object = drrwe->drr_object; tx = dmu_tx_create(rwa->os); dmu_tx_hold_write(tx, drrwe->drr_object, drrwe->drr_offset, drrwe->drr_length); err = dmu_tx_assign(tx, TXG_WAIT); if (err != 0) { dmu_tx_abort(tx); return (err); } dmu_write_embedded(rwa->os, drrwe->drr_object, drrwe->drr_offset, data, drrwe->drr_etype, drrwe->drr_compression, drrwe->drr_lsize, drrwe->drr_psize, rwa->byteswap ^ ZFS_HOST_BYTEORDER, tx); /* See comment in restore_write. */ save_resume_state(rwa, drrwe->drr_object, drrwe->drr_offset, tx); dmu_tx_commit(tx); return (0); } static int receive_spill(struct receive_writer_arg *rwa, struct drr_spill *drrs, abd_t *abd) { dmu_buf_t *db, *db_spill; int err; if (drrs->drr_length < SPA_MINBLOCKSIZE || drrs->drr_length > spa_maxblocksize(dmu_objset_spa(rwa->os))) return (SET_ERROR(EINVAL)); /* * This is an unmodified spill block which was added to the stream * to resolve an issue with incorrectly removing spill blocks. It * should be ignored by current versions of the code which support * the DRR_FLAG_SPILL_BLOCK flag. 
*/ if (rwa->spill && DRR_SPILL_IS_UNMODIFIED(drrs->drr_flags)) { abd_free(abd); return (0); } if (rwa->raw) { if (!DMU_OT_IS_VALID(drrs->drr_type) || drrs->drr_compressiontype >= ZIO_COMPRESS_FUNCTIONS || drrs->drr_compressed_size == 0) return (SET_ERROR(EINVAL)); } if (dmu_object_info(rwa->os, drrs->drr_object, NULL) != 0) return (SET_ERROR(EINVAL)); if (drrs->drr_object > rwa->max_object) rwa->max_object = drrs->drr_object; VERIFY0(dmu_bonus_hold(rwa->os, drrs->drr_object, FTAG, &db)); if ((err = dmu_spill_hold_by_bonus(db, DMU_READ_NO_DECRYPT, FTAG, &db_spill)) != 0) { dmu_buf_rele(db, FTAG); return (err); } dmu_tx_t *tx = dmu_tx_create(rwa->os); dmu_tx_hold_spill(tx, db->db_object); err = dmu_tx_assign(tx, TXG_WAIT); if (err != 0) { dmu_buf_rele(db, FTAG); dmu_buf_rele(db_spill, FTAG); dmu_tx_abort(tx); return (err); } /* * Spill blocks may both grow and shrink. When a change in size * occurs any existing dbuf must be updated to match the logical * size of the provided arc_buf_t. */ if (db_spill->db_size != drrs->drr_length) { dmu_buf_will_fill(db_spill, tx); VERIFY0(dbuf_spill_set_blksz(db_spill, drrs->drr_length, tx)); } arc_buf_t *abuf; if (rwa->raw) { boolean_t byteorder = ZFS_HOST_BYTEORDER ^ !!DRR_IS_RAW_BYTESWAPPED(drrs->drr_flags) ^ rwa->byteswap; abuf = arc_loan_raw_buf(dmu_objset_spa(rwa->os), drrs->drr_object, byteorder, drrs->drr_salt, drrs->drr_iv, drrs->drr_mac, drrs->drr_type, drrs->drr_compressed_size, drrs->drr_length, drrs->drr_compressiontype, 0); } else { abuf = arc_loan_buf(dmu_objset_spa(rwa->os), DMU_OT_IS_METADATA(drrs->drr_type), drrs->drr_length); if (rwa->byteswap) { dmu_object_byteswap_t byteswap = DMU_OT_BYTESWAP(drrs->drr_type); dmu_ot_byteswap[byteswap].ob_func(abd_to_buf(abd), DRR_SPILL_PAYLOAD_SIZE(drrs)); } } memcpy(abuf->b_data, abd_to_buf(abd), DRR_SPILL_PAYLOAD_SIZE(drrs)); abd_free(abd); dbuf_assign_arcbuf((dmu_buf_impl_t *)db_spill, abuf, tx); dmu_buf_rele(db, FTAG); dmu_buf_rele(db_spill, FTAG); dmu_tx_commit(tx); return 
(0); } noinline static int receive_free(struct receive_writer_arg *rwa, struct drr_free *drrf) { int err; if (drrf->drr_length != -1ULL && drrf->drr_offset + drrf->drr_length < drrf->drr_offset) return (SET_ERROR(EINVAL)); if (dmu_object_info(rwa->os, drrf->drr_object, NULL) != 0) return (SET_ERROR(EINVAL)); if (drrf->drr_object > rwa->max_object) rwa->max_object = drrf->drr_object; err = dmu_free_long_range(rwa->os, drrf->drr_object, drrf->drr_offset, drrf->drr_length); return (err); } static int receive_object_range(struct receive_writer_arg *rwa, struct drr_object_range *drror) { /* * By default, we assume this block is in our native format * (ZFS_HOST_BYTEORDER). We then take into account whether * the send stream is byteswapped (rwa->byteswap). Finally, * we need to byteswap again if this particular block was * in non-native format on the send side. */ boolean_t byteorder = ZFS_HOST_BYTEORDER ^ rwa->byteswap ^ !!DRR_IS_RAW_BYTESWAPPED(drror->drr_flags); /* * Since dnode block sizes are constant, we should not need to worry * about making sure that the dnode block size is the same on the * sending and receiving sides for the time being. For non-raw sends, * this does not matter (and in fact we do not send a DRR_OBJECT_RANGE * record at all). Raw sends require this record type because the * encryption parameters are used to protect an entire block of bonus * buffers. If the size of dnode blocks ever becomes variable, * handling will need to be added to ensure that dnode block sizes * match on the sending and receiving side. */ if (drror->drr_numslots != DNODES_PER_BLOCK || P2PHASE(drror->drr_firstobj, DNODES_PER_BLOCK) != 0 || !rwa->raw) return (SET_ERROR(EINVAL)); if (drror->drr_firstobj > rwa->max_object) rwa->max_object = drror->drr_firstobj; /* * The DRR_OBJECT_RANGE handling must be deferred to receive_object() * so that the block of dnodes is not written out when it's empty, * and converted to a HOLE BP. 
*/ rwa->or_crypt_params_present = B_TRUE; rwa->or_firstobj = drror->drr_firstobj; rwa->or_numslots = drror->drr_numslots; memcpy(rwa->or_salt, drror->drr_salt, ZIO_DATA_SALT_LEN); memcpy(rwa->or_iv, drror->drr_iv, ZIO_DATA_IV_LEN); memcpy(rwa->or_mac, drror->drr_mac, ZIO_DATA_MAC_LEN); rwa->or_byteorder = byteorder; return (0); } /* * Until we have the ability to redact large ranges of data efficiently, we * process these records as frees. */ noinline static int receive_redact(struct receive_writer_arg *rwa, struct drr_redact *drrr) { struct drr_free drrf = {0}; drrf.drr_length = drrr->drr_length; drrf.drr_object = drrr->drr_object; drrf.drr_offset = drrr->drr_offset; drrf.drr_toguid = drrr->drr_toguid; return (receive_free(rwa, &drrf)); } /* used to destroy the drc_ds on error */ static void dmu_recv_cleanup_ds(dmu_recv_cookie_t *drc) { dsl_dataset_t *ds = drc->drc_ds; ds_hold_flags_t dsflags; dsflags = (drc->drc_raw) ? DS_HOLD_FLAG_NONE : DS_HOLD_FLAG_DECRYPT; /* * Wait for the txg sync before cleaning up the receive. For * resumable receives, this ensures that our resume state has * been written out to disk. For raw receives, this ensures * that the user accounting code will not attempt to do anything * after we stopped receiving the dataset. 
*/ txg_wait_synced(ds->ds_dir->dd_pool, 0); ds->ds_objset->os_raw_receive = B_FALSE; rrw_enter(&ds->ds_bp_rwlock, RW_READER, FTAG); if (drc->drc_resumable && drc->drc_should_save && !BP_IS_HOLE(dsl_dataset_get_blkptr(ds))) { rrw_exit(&ds->ds_bp_rwlock, FTAG); dsl_dataset_disown(ds, dsflags, dmu_recv_tag); } else { char name[ZFS_MAX_DATASET_NAME_LEN]; rrw_exit(&ds->ds_bp_rwlock, FTAG); dsl_dataset_name(ds, name); dsl_dataset_disown(ds, dsflags, dmu_recv_tag); if (!drc->drc_heal) (void) dsl_destroy_head(name); } } static void receive_cksum(dmu_recv_cookie_t *drc, int len, void *buf) { if (drc->drc_byteswap) { (void) fletcher_4_incremental_byteswap(buf, len, &drc->drc_cksum); } else { (void) fletcher_4_incremental_native(buf, len, &drc->drc_cksum); } } /* * Read the payload into a buffer of size len, and update the current record's * payload field. * Allocate drc->drc_next_rrd and read the next record's header into * drc->drc_next_rrd->header. * Verify checksum of payload and next record. */ static int receive_read_payload_and_next_header(dmu_recv_cookie_t *drc, int len, void *buf) { int err; if (len != 0) { ASSERT3U(len, <=, SPA_MAXBLOCKSIZE); err = receive_read(drc, len, buf); if (err != 0) return (err); receive_cksum(drc, len, buf); /* note: rrd is NULL when reading the begin record's payload */ if (drc->drc_rrd != NULL) { drc->drc_rrd->payload = buf; drc->drc_rrd->payload_size = len; drc->drc_rrd->bytes_read = drc->drc_bytes_read; } } else { ASSERT3P(buf, ==, NULL); } drc->drc_prev_cksum = drc->drc_cksum; drc->drc_next_rrd = kmem_zalloc(sizeof (*drc->drc_next_rrd), KM_SLEEP); err = receive_read(drc, sizeof (drc->drc_next_rrd->header), &drc->drc_next_rrd->header); drc->drc_next_rrd->bytes_read = drc->drc_bytes_read; if (err != 0) { kmem_free(drc->drc_next_rrd, sizeof (*drc->drc_next_rrd)); drc->drc_next_rrd = NULL; return (err); } if (drc->drc_next_rrd->header.drr_type == DRR_BEGIN) { kmem_free(drc->drc_next_rrd, sizeof (*drc->drc_next_rrd)); drc->drc_next_rrd = 
NULL; return (SET_ERROR(EINVAL)); } /* * Note: checksum is of everything up to but not including the * checksum itself. */ ASSERT3U(offsetof(dmu_replay_record_t, drr_u.drr_checksum.drr_checksum), ==, sizeof (dmu_replay_record_t) - sizeof (zio_cksum_t)); receive_cksum(drc, offsetof(dmu_replay_record_t, drr_u.drr_checksum.drr_checksum), &drc->drc_next_rrd->header); zio_cksum_t cksum_orig = drc->drc_next_rrd->header.drr_u.drr_checksum.drr_checksum; zio_cksum_t *cksump = &drc->drc_next_rrd->header.drr_u.drr_checksum.drr_checksum; if (drc->drc_byteswap) byteswap_record(&drc->drc_next_rrd->header); if ((!ZIO_CHECKSUM_IS_ZERO(cksump)) && !ZIO_CHECKSUM_EQUAL(drc->drc_cksum, *cksump)) { kmem_free(drc->drc_next_rrd, sizeof (*drc->drc_next_rrd)); drc->drc_next_rrd = NULL; return (SET_ERROR(ECKSUM)); } receive_cksum(drc, sizeof (cksum_orig), &cksum_orig); return (0); } /* * Issue the prefetch reads for any necessary indirect blocks. * * We use the object ignore list to tell us whether or not to issue prefetches * for a given object. We do this for both correctness (in case the blocksize * of an object has changed) and performance (if the object doesn't exist, don't * needlessly try to issue prefetches). We also trim the list as we go through * the stream to prevent it from growing to an unbounded size. * * The object numbers within will always be in sorted order, and any write * records we see will also be in sorted order, but they're not sorted with * respect to each other (i.e. we can get several object records before * receiving each object's write records). As a result, once we've reached a * given object number, we can safely remove any reference to lower object * numbers in the ignore list. In practice, we receive up to 32 object records * before receiving write records, so the list can have up to 32 nodes in it. 
*/ static void receive_read_prefetch(dmu_recv_cookie_t *drc, uint64_t object, uint64_t offset, uint64_t length) { if (!objlist_exists(drc->drc_ignore_objlist, object)) { dmu_prefetch(drc->drc_os, object, 1, offset, length, ZIO_PRIORITY_SYNC_READ); } } /* * Read records off the stream, issuing any necessary prefetches. */ static int receive_read_record(dmu_recv_cookie_t *drc) { int err; switch (drc->drc_rrd->header.drr_type) { case DRR_OBJECT: { struct drr_object *drro = &drc->drc_rrd->header.drr_u.drr_object; uint32_t size = DRR_OBJECT_PAYLOAD_SIZE(drro); void *buf = NULL; dmu_object_info_t doi; if (size != 0) buf = kmem_zalloc(size, KM_SLEEP); err = receive_read_payload_and_next_header(drc, size, buf); if (err != 0) { kmem_free(buf, size); return (err); } err = dmu_object_info(drc->drc_os, drro->drr_object, &doi); /* * See receive_read_prefetch for an explanation why we're * storing this object in the ignore_obj_list. */ if (err == ENOENT || err == EEXIST || (err == 0 && doi.doi_data_block_size != drro->drr_blksz)) { objlist_insert(drc->drc_ignore_objlist, drro->drr_object); err = 0; } return (err); } case DRR_FREEOBJECTS: { err = receive_read_payload_and_next_header(drc, 0, NULL); return (err); } case DRR_WRITE: { struct drr_write *drrw = &drc->drc_rrd->header.drr_u.drr_write; int size = DRR_WRITE_PAYLOAD_SIZE(drrw); abd_t *abd = abd_alloc_linear(size, B_FALSE); err = receive_read_payload_and_next_header(drc, size, abd_to_buf(abd)); if (err != 0) { abd_free(abd); return (err); } drc->drc_rrd->abd = abd; receive_read_prefetch(drc, drrw->drr_object, drrw->drr_offset, drrw->drr_logical_size); return (err); } case DRR_WRITE_EMBEDDED: { struct drr_write_embedded *drrwe = &drc->drc_rrd->header.drr_u.drr_write_embedded; uint32_t size = P2ROUNDUP(drrwe->drr_psize, 8); void *buf = kmem_zalloc(size, KM_SLEEP); err = receive_read_payload_and_next_header(drc, size, buf); if (err != 0) { kmem_free(buf, size); return (err); } receive_read_prefetch(drc, drrwe->drr_object, 
drrwe->drr_offset, drrwe->drr_length); return (err); } case DRR_FREE: case DRR_REDACT: { /* * It might be beneficial to prefetch indirect blocks here, but * we don't really have the data to decide for sure. */ err = receive_read_payload_and_next_header(drc, 0, NULL); return (err); } case DRR_END: { struct drr_end *drre = &drc->drc_rrd->header.drr_u.drr_end; if (!ZIO_CHECKSUM_EQUAL(drc->drc_prev_cksum, drre->drr_checksum)) return (SET_ERROR(ECKSUM)); return (0); } case DRR_SPILL: { struct drr_spill *drrs = &drc->drc_rrd->header.drr_u.drr_spill; int size = DRR_SPILL_PAYLOAD_SIZE(drrs); abd_t *abd = abd_alloc_linear(size, B_FALSE); err = receive_read_payload_and_next_header(drc, size, abd_to_buf(abd)); if (err != 0) abd_free(abd); else drc->drc_rrd->abd = abd; return (err); } case DRR_OBJECT_RANGE: { err = receive_read_payload_and_next_header(drc, 0, NULL); return (err); } default: return (SET_ERROR(EINVAL)); } } static void dprintf_drr(struct receive_record_arg *rrd, int err) { #ifdef ZFS_DEBUG switch (rrd->header.drr_type) { case DRR_OBJECT: { struct drr_object *drro = &rrd->header.drr_u.drr_object; dprintf("drr_type = OBJECT obj = %llu type = %u " "bonustype = %u blksz = %u bonuslen = %u cksumtype = %u " "compress = %u dn_slots = %u err = %d\n", (u_longlong_t)drro->drr_object, drro->drr_type, drro->drr_bonustype, drro->drr_blksz, drro->drr_bonuslen, drro->drr_checksumtype, drro->drr_compress, drro->drr_dn_slots, err); break; } case DRR_FREEOBJECTS: { struct drr_freeobjects *drrfo = &rrd->header.drr_u.drr_freeobjects; dprintf("drr_type = FREEOBJECTS firstobj = %llu " "numobjs = %llu err = %d\n", (u_longlong_t)drrfo->drr_firstobj, (u_longlong_t)drrfo->drr_numobjs, err); break; } case DRR_WRITE: { struct drr_write *drrw = &rrd->header.drr_u.drr_write; dprintf("drr_type = WRITE obj = %llu type = %u offset = %llu " "lsize = %llu cksumtype = %u flags = %u " "compress = %u psize = %llu err = %d\n", (u_longlong_t)drrw->drr_object, drrw->drr_type, 
(u_longlong_t)drrw->drr_offset, (u_longlong_t)drrw->drr_logical_size, drrw->drr_checksumtype, drrw->drr_flags, drrw->drr_compressiontype, (u_longlong_t)drrw->drr_compressed_size, err); break; } case DRR_WRITE_BYREF: { struct drr_write_byref *drrwbr = &rrd->header.drr_u.drr_write_byref; dprintf("drr_type = WRITE_BYREF obj = %llu offset = %llu " "length = %llu toguid = %llx refguid = %llx " "refobject = %llu refoffset = %llu cksumtype = %u " "flags = %u err = %d\n", (u_longlong_t)drrwbr->drr_object, (u_longlong_t)drrwbr->drr_offset, (u_longlong_t)drrwbr->drr_length, (u_longlong_t)drrwbr->drr_toguid, (u_longlong_t)drrwbr->drr_refguid, (u_longlong_t)drrwbr->drr_refobject, (u_longlong_t)drrwbr->drr_refoffset, drrwbr->drr_checksumtype, drrwbr->drr_flags, err); break; } case DRR_WRITE_EMBEDDED: { struct drr_write_embedded *drrwe = &rrd->header.drr_u.drr_write_embedded; dprintf("drr_type = WRITE_EMBEDDED obj = %llu offset = %llu " "length = %llu compress = %u etype = %u lsize = %u " "psize = %u err = %d\n", (u_longlong_t)drrwe->drr_object, (u_longlong_t)drrwe->drr_offset, (u_longlong_t)drrwe->drr_length, drrwe->drr_compression, drrwe->drr_etype, drrwe->drr_lsize, drrwe->drr_psize, err); break; } case DRR_FREE: { struct drr_free *drrf = &rrd->header.drr_u.drr_free; dprintf("drr_type = FREE obj = %llu offset = %llu " "length = %lld err = %d\n", (u_longlong_t)drrf->drr_object, (u_longlong_t)drrf->drr_offset, (longlong_t)drrf->drr_length, err); break; } case DRR_SPILL: { struct drr_spill *drrs = &rrd->header.drr_u.drr_spill; dprintf("drr_type = SPILL obj = %llu length = %llu " "err = %d\n", (u_longlong_t)drrs->drr_object, (u_longlong_t)drrs->drr_length, err); break; } case DRR_OBJECT_RANGE: { struct drr_object_range *drror = &rrd->header.drr_u.drr_object_range; dprintf("drr_type = OBJECT_RANGE firstobj = %llu " "numslots = %llu flags = %u err = %d\n", (u_longlong_t)drror->drr_firstobj, (u_longlong_t)drror->drr_numslots, drror->drr_flags, err); break; } default: return; } 
#endif } /* * Commit the records to the pool. */ static int receive_process_record(struct receive_writer_arg *rwa, struct receive_record_arg *rrd) { int err; /* Processing in order, therefore bytes_read should be increasing. */ ASSERT3U(rrd->bytes_read, >=, rwa->bytes_read); rwa->bytes_read = rrd->bytes_read; /* We can only heal write records; other ones get ignored */ if (rwa->heal && rrd->header.drr_type != DRR_WRITE) { if (rrd->abd != NULL) { abd_free(rrd->abd); rrd->abd = NULL; } else if (rrd->payload != NULL) { kmem_free(rrd->payload, rrd->payload_size); rrd->payload = NULL; } return (0); } if (!rwa->heal && rrd->header.drr_type != DRR_WRITE) { err = flush_write_batch(rwa); if (err != 0) { if (rrd->abd != NULL) { abd_free(rrd->abd); rrd->abd = NULL; rrd->payload = NULL; } else if (rrd->payload != NULL) { kmem_free(rrd->payload, rrd->payload_size); rrd->payload = NULL; } return (err); } } switch (rrd->header.drr_type) { case DRR_OBJECT: { struct drr_object *drro = &rrd->header.drr_u.drr_object; err = receive_object(rwa, drro, rrd->payload); kmem_free(rrd->payload, rrd->payload_size); rrd->payload = NULL; break; } case DRR_FREEOBJECTS: { struct drr_freeobjects *drrfo = &rrd->header.drr_u.drr_freeobjects; err = receive_freeobjects(rwa, drrfo); break; } case DRR_WRITE: { err = receive_process_write_record(rwa, rrd); if (rwa->heal) { /* * If healing - always free the abd after processing */ abd_free(rrd->abd); rrd->abd = NULL; } else if (err != EAGAIN) { /* * On success, a non-healing * receive_process_write_record() returns * EAGAIN to indicate that we do not want to free * the rrd or arc_buf. 
*/ ASSERT(err != 0); abd_free(rrd->abd); rrd->abd = NULL; } break; } case DRR_WRITE_EMBEDDED: { struct drr_write_embedded *drrwe = &rrd->header.drr_u.drr_write_embedded; err = receive_write_embedded(rwa, drrwe, rrd->payload); kmem_free(rrd->payload, rrd->payload_size); rrd->payload = NULL; break; } case DRR_FREE: { struct drr_free *drrf = &rrd->header.drr_u.drr_free; err = receive_free(rwa, drrf); break; } case DRR_SPILL: { struct drr_spill *drrs = &rrd->header.drr_u.drr_spill; err = receive_spill(rwa, drrs, rrd->abd); if (err != 0) abd_free(rrd->abd); rrd->abd = NULL; rrd->payload = NULL; break; } case DRR_OBJECT_RANGE: { struct drr_object_range *drror = &rrd->header.drr_u.drr_object_range; err = receive_object_range(rwa, drror); break; } case DRR_REDACT: { struct drr_redact *drrr = &rrd->header.drr_u.drr_redact; err = receive_redact(rwa, drrr); break; } default: err = (SET_ERROR(EINVAL)); } if (err != 0) dprintf_drr(rrd, err); return (err); } /* * dmu_recv_stream's worker thread; pull records off the queue, and then call * receive_process_record When we're done, signal the main thread and exit. */ static __attribute__((noreturn)) void receive_writer_thread(void *arg) { struct receive_writer_arg *rwa = arg; struct receive_record_arg *rrd; fstrans_cookie_t cookie = spl_fstrans_mark(); for (rrd = bqueue_dequeue(&rwa->q); !rrd->eos_marker; rrd = bqueue_dequeue(&rwa->q)) { /* * If there's an error, the main thread will stop putting things * on the queue, but we need to clear everything in it before we * can exit. */ int err = 0; if (rwa->err == 0) { err = receive_process_record(rwa, rrd); } else if (rrd->abd != NULL) { abd_free(rrd->abd); rrd->abd = NULL; rrd->payload = NULL; } else if (rrd->payload != NULL) { kmem_free(rrd->payload, rrd->payload_size); rrd->payload = NULL; } /* * EAGAIN indicates that this record has been saved (on * raw->write_batch), and will be used again, so we don't * free it. * When healing data we always need to free the record. 
*/ if (err != EAGAIN || rwa->heal) { if (rwa->err == 0) rwa->err = err; kmem_free(rrd, sizeof (*rrd)); } } kmem_free(rrd, sizeof (*rrd)); if (rwa->heal) { zio_wait(rwa->heal_pio); } else { int err = flush_write_batch(rwa); if (rwa->err == 0) rwa->err = err; } mutex_enter(&rwa->mutex); rwa->done = B_TRUE; cv_signal(&rwa->cv); mutex_exit(&rwa->mutex); spl_fstrans_unmark(cookie); thread_exit(); } static int resume_check(dmu_recv_cookie_t *drc, nvlist_t *begin_nvl) { uint64_t val; objset_t *mos = dmu_objset_pool(drc->drc_os)->dp_meta_objset; uint64_t dsobj = dmu_objset_id(drc->drc_os); uint64_t resume_obj, resume_off; if (nvlist_lookup_uint64(begin_nvl, "resume_object", &resume_obj) != 0 || nvlist_lookup_uint64(begin_nvl, "resume_offset", &resume_off) != 0) { return (SET_ERROR(EINVAL)); } VERIFY0(zap_lookup(mos, dsobj, DS_FIELD_RESUME_OBJECT, sizeof (val), 1, &val)); if (resume_obj != val) return (SET_ERROR(EINVAL)); VERIFY0(zap_lookup(mos, dsobj, DS_FIELD_RESUME_OFFSET, sizeof (val), 1, &val)); if (resume_off != val) return (SET_ERROR(EINVAL)); return (0); } /* * Read in the stream's records, one by one, and apply them to the pool. There * are two threads involved; the thread that calls this function will spin up a * worker thread, read the records off the stream one by one, and issue * prefetches for any necessary indirect blocks. It will then push the records * onto an internal blocking queue. The worker thread will pull the records off * the queue, and actually write the data into the DMU. This way, the worker * thread doesn't have to wait for reads to complete, since everything it needs * (the indirect blocks) will be prefetched. * * NB: callers *must* call dmu_recv_end() if this succeeds. 
*/ int dmu_recv_stream(dmu_recv_cookie_t *drc, offset_t *voffp) { int err = 0; struct receive_writer_arg *rwa = kmem_zalloc(sizeof (*rwa), KM_SLEEP); if (dsl_dataset_has_resume_receive_state(drc->drc_ds)) { uint64_t bytes = 0; (void) zap_lookup(drc->drc_ds->ds_dir->dd_pool->dp_meta_objset, drc->drc_ds->ds_object, DS_FIELD_RESUME_BYTES, sizeof (bytes), 1, &bytes); drc->drc_bytes_read += bytes; } drc->drc_ignore_objlist = objlist_create(); /* these were verified in dmu_recv_begin */ ASSERT3U(DMU_GET_STREAM_HDRTYPE(drc->drc_drrb->drr_versioninfo), ==, DMU_SUBSTREAM); ASSERT3U(drc->drc_drrb->drr_type, <, DMU_OST_NUMTYPES); ASSERT(dsl_dataset_phys(drc->drc_ds)->ds_flags & DS_FLAG_INCONSISTENT); ASSERT0(drc->drc_os->os_encrypted && (drc->drc_featureflags & DMU_BACKUP_FEATURE_EMBED_DATA)); /* handle DSL encryption key payload */ if (drc->drc_featureflags & DMU_BACKUP_FEATURE_RAW) { nvlist_t *keynvl = NULL; ASSERT(drc->drc_os->os_encrypted); ASSERT(drc->drc_raw); err = nvlist_lookup_nvlist(drc->drc_begin_nvl, "crypt_keydata", &keynvl); if (err != 0) goto out; if (!drc->drc_heal) { /* * If this is a new dataset we set the key immediately. * Otherwise we don't want to change the key until we * are sure the rest of the receive succeeded so we * stash the keynvl away until then. */ err = dsl_crypto_recv_raw(spa_name(drc->drc_os->os_spa), drc->drc_ds->ds_object, drc->drc_fromsnapobj, drc->drc_drrb->drr_type, keynvl, drc->drc_newfs); if (err != 0) goto out; } /* see comment in dmu_recv_end_sync() */ drc->drc_ivset_guid = 0; (void) nvlist_lookup_uint64(keynvl, "to_ivset_guid", &drc->drc_ivset_guid); if (!drc->drc_newfs) drc->drc_keynvl = fnvlist_dup(keynvl); } if (drc->drc_featureflags & DMU_BACKUP_FEATURE_RESUMING) { err = resume_check(drc, drc->drc_begin_nvl); if (err != 0) goto out; } /* * If we failed before this point we will clean up any new resume * state that was created. Now that we've gotten past the initial * checks we are ok to retain that resume state. 
*/ drc->drc_should_save = B_TRUE; (void) bqueue_init(&rwa->q, zfs_recv_queue_ff, MAX(zfs_recv_queue_length, 2 * zfs_max_recordsize), offsetof(struct receive_record_arg, node)); cv_init(&rwa->cv, NULL, CV_DEFAULT, NULL); mutex_init(&rwa->mutex, NULL, MUTEX_DEFAULT, NULL); rwa->os = drc->drc_os; rwa->byteswap = drc->drc_byteswap; rwa->heal = drc->drc_heal; rwa->tofs = drc->drc_tofs; rwa->resumable = drc->drc_resumable; rwa->raw = drc->drc_raw; rwa->spill = drc->drc_spill; rwa->full = (drc->drc_drr_begin->drr_u.drr_begin.drr_fromguid == 0); rwa->os->os_raw_receive = drc->drc_raw; if (drc->drc_heal) { rwa->heal_pio = zio_root(drc->drc_os->os_spa, NULL, NULL, ZIO_FLAG_GODFATHER); } list_create(&rwa->write_batch, sizeof (struct receive_record_arg), offsetof(struct receive_record_arg, node.bqn_node)); (void) thread_create(NULL, 0, receive_writer_thread, rwa, 0, curproc, TS_RUN, minclsyspri); /* * We're reading rwa->err without locks, which is safe since we are the * only reader, and the worker thread is the only writer. It's ok if we * miss a write for an iteration or two of the loop, since the writer * thread will keep freeing records we send it until we send it an eos * marker. * * We can leave this loop in 3 ways: First, if rwa->err is * non-zero. In that case, the writer thread will free the rrd we just * pushed. Second, if we're interrupted; in that case, either it's the * first loop and drc->drc_rrd was never allocated, or it's later, and * drc->drc_rrd has been handed off to the writer thread who will free * it. Finally, if receive_read_record fails or we're at the end of the * stream, then we free drc->drc_rrd and exit. 
*/ while (rwa->err == 0) { if (issig(JUSTLOOKING) && issig(FORREAL)) { err = SET_ERROR(EINTR); break; } ASSERT3P(drc->drc_rrd, ==, NULL); drc->drc_rrd = drc->drc_next_rrd; drc->drc_next_rrd = NULL; /* Allocates and loads header into drc->drc_next_rrd */ err = receive_read_record(drc); if (drc->drc_rrd->header.drr_type == DRR_END || err != 0) { kmem_free(drc->drc_rrd, sizeof (*drc->drc_rrd)); drc->drc_rrd = NULL; break; } bqueue_enqueue(&rwa->q, drc->drc_rrd, sizeof (struct receive_record_arg) + drc->drc_rrd->payload_size); drc->drc_rrd = NULL; } ASSERT3P(drc->drc_rrd, ==, NULL); drc->drc_rrd = kmem_zalloc(sizeof (*drc->drc_rrd), KM_SLEEP); drc->drc_rrd->eos_marker = B_TRUE; bqueue_enqueue_flush(&rwa->q, drc->drc_rrd, 1); mutex_enter(&rwa->mutex); while (!rwa->done) { /* * We need to use cv_wait_sig() so that any process that may * be sleeping here can still fork. */ (void) cv_wait_sig(&rwa->cv, &rwa->mutex); } mutex_exit(&rwa->mutex); /* * If we are receiving a full stream as a clone, all object IDs which * are greater than the maximum ID referenced in the stream are * by definition unused and must be freed. */ if (drc->drc_clone && drc->drc_drrb->drr_fromguid == 0) { uint64_t obj = rwa->max_object + 1; int free_err = 0; int next_err = 0; while (next_err == 0) { free_err = dmu_free_long_object(rwa->os, obj); if (free_err != 0 && free_err != ENOENT) break; next_err = dmu_object_next(rwa->os, &obj, FALSE, 0); } if (err == 0) { if (free_err != 0 && free_err != ENOENT) err = free_err; else if (next_err != ESRCH) err = next_err; } } cv_destroy(&rwa->cv); mutex_destroy(&rwa->mutex); bqueue_destroy(&rwa->q); list_destroy(&rwa->write_batch); if (err == 0) err = rwa->err; out: /* * If we hit an error before we started the receive_writer_thread * we need to clean up the next_rrd we create by processing the * DRR_BEGIN record. 
*/
	if (drc->drc_next_rrd != NULL)
		kmem_free(drc->drc_next_rrd, sizeof (*drc->drc_next_rrd));

	/*
	 * The objset will be invalidated by dmu_recv_end() when we do
	 * dsl_dataset_clone_swap_sync_impl().
	 */
	drc->drc_os = NULL;

	kmem_free(rwa, sizeof (*rwa));
	nvlist_free(drc->drc_begin_nvl);

	if (err != 0) {
		/*
		 * Clean up references. If receive is not resumable,
		 * destroy what we created, so we don't leave it in
		 * the inconsistent state.
		 */
		dmu_recv_cleanup_ds(drc);
		nvlist_free(drc->drc_keynvl);
	}

	objlist_destroy(drc->drc_ignore_objlist);
	drc->drc_ignore_objlist = NULL;
	*voffp = drc->drc_voff;
	return (err);
}

/*
 * Check half of the "finish receive" sync task; it is handed to
 * dsl_sync_task() together with dmu_recv_end_sync() by
 * dmu_recv_existing_end() and dmu_recv_new_end().
 *
 * arg is the dmu_recv_cookie_t of the receive, tx the transaction the
 * check is evaluated against.
 *
 * Three cases are distinguished:
 *  - drc_heal:   nothing is checked, 0 is returned.
 *  - !drc_newfs: drc_tofs is held as origin_head and, in order, the
 *                forced snapshot destroys (only if drc_force), the raw
 *                key (only if drc_keynvl is set), the clone swap, the
 *                new snapshot and the destruction of drc_ds are checked.
 *  - drc_newfs:  only creation of the snapshot on drc_ds is checked.
 *
 * Returns 0 if every applicable check passes, otherwise the first
 * non-zero error encountered.  Every early return releases the FTAG
 * hold on origin_head first.
 */
static int
dmu_recv_end_check(void *arg, dmu_tx_t *tx)
{
	dmu_recv_cookie_t *drc = arg;
	dsl_pool_t *dp = dmu_tx_pool(tx);
	int error;

	/* The dataset being finished must still be owned by the receive. */
	ASSERT3P(drc->drc_ds->ds_owner, ==, dmu_recv_tag);

	if (drc->drc_heal) {
		error = 0;
	} else if (!drc->drc_newfs) {
		dsl_dataset_t *origin_head;

		error = dsl_dataset_hold(dp, drc->drc_tofs, FTAG, &origin_head);
		if (error != 0)
			return (error);
		if (drc->drc_force) {
			/*
			 * We will destroy any snapshots in tofs (i.e. before
			 * origin_head) that are after the origin (which is
			 * the snap before drc_ds, because drc_ds can not
			 * have any snaps of its own).
			 */
			uint64_t obj;

			/*
			 * Walk the ds_prev_snap_obj chain backwards from
			 * origin_head until the snapshot drc_ds is based on.
			 */
			obj = dsl_dataset_phys(origin_head)->ds_prev_snap_obj;
			while (obj !=
			    dsl_dataset_phys(drc->drc_ds)->ds_prev_snap_obj) {
				dsl_dataset_t *snap;
				error = dsl_dataset_hold_obj(dp, obj, FTAG,
				    &snap);
				if (error != 0)
					break;
				/* A snapshot from another ds_dir is EINVAL. */
				if (snap->ds_dir != origin_head->ds_dir)
					error = SET_ERROR(EINVAL);
				if (error == 0)  {
					error = dsl_destroy_snapshot_check_impl(
					    snap, B_FALSE);
				}
				/*
				 * Read the next object id before dropping the
				 * hold, then stop on any error from above.
				 */
				obj = dsl_dataset_phys(snap)->ds_prev_snap_obj;
				dsl_dataset_rele(snap, FTAG);
				if (error != 0)
					break;
			}
			if (error != 0) {
				dsl_dataset_rele(origin_head, FTAG);
				return (error);
			}
		}
		if (drc->drc_keynvl != NULL) {
			error = dsl_crypto_recv_raw_key_check(drc->drc_ds,
			    drc->drc_keynvl, tx);
			if (error != 0) {
				dsl_dataset_rele(origin_head, FTAG);
				return (error);
			}
		}

		error = dsl_dataset_clone_swap_check_impl(drc->drc_ds,
		    origin_head, drc->drc_force, drc->drc_owner, tx);
		if (error != 0) {
			dsl_dataset_rele(origin_head, FTAG);
			return (error);
		}
		error = dsl_dataset_snapshot_check_impl(origin_head,
		    drc->drc_tosnap, tx, B_TRUE, 1,
		    drc->drc_cred, drc->drc_proc);
		/* origin_head is not needed past this point. */
		dsl_dataset_rele(origin_head, FTAG);
		if (error != 0)
			return (error);

		/* The result of this last check is returned below. */
		error = dsl_destroy_head_check_impl(drc->drc_ds, 1);
	} else {
		error = dsl_dataset_snapshot_check_impl(drc->drc_ds,
		    drc->drc_tosnap, tx, B_TRUE, 1,
		    drc->drc_cred, drc->drc_proc);
	}
	return (error);
}

/*
 * Sync half of the "finish receive" sync task (see dmu_recv_end_check()
 * for the matching checks).
 *
 * arg is the dmu_recv_cookie_t of the receive, tx the transaction all
 * changes are made in.
 *
 * It logs the event, clears os_raw_receive on the objset and then, per
 * case:
 *  - drc_heal:   only frees drc_keynvl if it is set.
 *  - !drc_newfs: optionally destroys the intervening snapshots
 *                (drc_force), applies the raw key, swaps drc_ds with
 *                origin_head, snapshots origin_head, stamps the new
 *                snapshot and destroys drc_ds.
 *  - drc_newfs:  snapshots drc_ds, stamps the new snapshot and removes
 *                any resume-state ZAP entries.
 *
 * In every case the function finishes by disowning drc_ds and setting
 * drc->drc_ds to NULL.  Failures are not returned; the fallible calls
 * are wrapped in VERIFY0().
 */
static void
dmu_recv_end_sync(void *arg, dmu_tx_t *tx)
{
	dmu_recv_cookie_t *drc = arg;
	dsl_pool_t *dp = dmu_tx_pool(tx);
	/* Sampled up front; drc_ds may be destroyed further down. */
	boolean_t encrypted = drc->drc_ds->ds_dir->dd_crypto_obj != 0;
	/* Object id of the snapshot created below; stays 0 when healing. */
	uint64_t newsnapobj = 0;

	spa_history_log_internal_ds(drc->drc_ds, "finish receiving",
	    tx, "snap=%s", drc->drc_tosnap);
	drc->drc_ds->ds_objset->os_raw_receive = B_FALSE;

	if (drc->drc_heal) {
		/* This branch only drops the key nvlist, if there is one. */
		if (drc->drc_keynvl != NULL) {
			nvlist_free(drc->drc_keynvl);
			drc->drc_keynvl = NULL;
		}
	} else if (!drc->drc_newfs) {
		dsl_dataset_t *origin_head;

		VERIFY0(dsl_dataset_hold(dp, drc->drc_tofs, FTAG,
		    &origin_head));

		if (drc->drc_force) {
			/*
			 * Destroy any snapshots of drc_tofs (origin_head)
			 * after the origin (the snap before drc_ds).
			 */
			uint64_t obj;

			obj = dsl_dataset_phys(origin_head)->ds_prev_snap_obj;
			while (obj !=
			    dsl_dataset_phys(drc->drc_ds)->ds_prev_snap_obj) {
				dsl_dataset_t *snap;
				VERIFY0(dsl_dataset_hold_obj(dp, obj, FTAG,
				    &snap));
				ASSERT3P(snap->ds_dir, ==, origin_head->ds_dir);
				/*
				 * Fetch the next object id before the
				 * snapshot is destroyed.
				 */
				obj = dsl_dataset_phys(snap)->ds_prev_snap_obj;
				dsl_destroy_snapshot_sync_impl(snap,
				    B_FALSE, tx);
				dsl_dataset_rele(snap, FTAG);
			}
		}
		if (drc->drc_keynvl != NULL) {
			dsl_crypto_recv_raw_key_sync(drc->drc_ds,
			    drc->drc_keynvl, tx);
			nvlist_free(drc->drc_keynvl);
			drc->drc_keynvl = NULL;
		}

		/* Both datasets must share the same previous snapshot. */
		VERIFY3P(drc->drc_ds->ds_prev, ==,
		    origin_head->ds_prev);

		dsl_dataset_clone_swap_sync_impl(drc->drc_ds,
		    origin_head, tx);
		/*
		 * The objset was evicted by dsl_dataset_clone_swap_sync_impl,
		 * so drc_os is no longer valid.
		 */
		drc->drc_os = NULL;

		dsl_dataset_snapshot_sync_impl(origin_head,
		    drc->drc_tosnap, tx);

		/* set snapshot's creation time and guid */
		dmu_buf_will_dirty(origin_head->ds_prev->ds_dbuf, tx);
		dsl_dataset_phys(origin_head->ds_prev)->ds_creation_time =
		    drc->drc_drrb->drr_creation_time;
		dsl_dataset_phys(origin_head->ds_prev)->ds_guid =
		    drc->drc_drrb->drr_toguid;
		dsl_dataset_phys(origin_head->ds_prev)->ds_flags &=
		    ~DS_FLAG_INCONSISTENT;

		/* The head itself is no longer inconsistent either. */
		dmu_buf_will_dirty(origin_head->ds_dbuf, tx);
		dsl_dataset_phys(origin_head)->ds_flags &=
		    ~DS_FLAG_INCONSISTENT;

		/* Remember the new snapshot's object id for the code below. */
		newsnapobj =
		    dsl_dataset_phys(origin_head)->ds_prev_snap_obj;

		dsl_dataset_rele(origin_head, FTAG);
		dsl_destroy_head_sync_impl(drc->drc_ds, tx);

		/*
		 * NOTE(review): origin_head is dereferenced here after its
		 * FTAG hold was released above; presumably the owner keeps
		 * the dataset alive -- confirm.
		 */
		if (drc->drc_owner != NULL)
			VERIFY3P(origin_head->ds_owner, ==, drc->drc_owner);
	} else {
		dsl_dataset_t *ds = drc->drc_ds;

		dsl_dataset_snapshot_sync_impl(ds, drc->drc_tosnap, tx);

		/* set snapshot's creation time and guid */
		dmu_buf_will_dirty(ds->ds_prev->ds_dbuf, tx);
		dsl_dataset_phys(ds->ds_prev)->ds_creation_time =
		    drc->drc_drrb->drr_creation_time;
		dsl_dataset_phys(ds->ds_prev)->ds_guid =
		    drc->drc_drrb->drr_toguid;
		dsl_dataset_phys(ds->ds_prev)->ds_flags &=
		    ~DS_FLAG_INCONSISTENT;

		dmu_buf_will_dirty(ds->ds_dbuf, tx);
		dsl_dataset_phys(ds)->ds_flags &= ~DS_FLAG_INCONSISTENT;
		/*
		 * Remove the DS_FIELD_RESUME_* entries; zap_remove() results
		 * are deliberately ignored via the (void) casts.
		 */
		if (dsl_dataset_has_resume_receive_state(ds)) {
			(void) zap_remove(dp->dp_meta_objset, ds->ds_object,
			    DS_FIELD_RESUME_FROMGUID, tx);
			(void) zap_remove(dp->dp_meta_objset, ds->ds_object,
			    DS_FIELD_RESUME_OBJECT, tx);
			(void) zap_remove(dp->dp_meta_objset, ds->ds_object,
			    DS_FIELD_RESUME_OFFSET, tx);
			(void) zap_remove(dp->dp_meta_objset, ds->ds_object,
			    DS_FIELD_RESUME_BYTES, tx);
			(void) zap_remove(dp->dp_meta_objset, ds->ds_object,
			    DS_FIELD_RESUME_TOGUID, tx);
			(void) zap_remove(dp->dp_meta_objset, ds->ds_object,
			    DS_FIELD_RESUME_TONAME, tx);
			(void) zap_remove(dp->dp_meta_objset, ds->ds_object,
			    DS_FIELD_RESUME_REDACT_BOOKMARK_SNAPS, tx);
		}
		newsnapobj =
		    dsl_dataset_phys(drc->drc_ds)->ds_prev_snap_obj;
	}

	/*
	 * If this is a raw receive, the crypt_keydata nvlist will include
	 * a to_ivset_guid for us to set on the new snapshot. This value
	 * will override the value generated by the snapshot code. However,
	 * this value may not be present, because older implementations of
	 * the raw send code did not include this value, and we are still
	 * allowed to receive them if the zfs_disable_ivset_guid_check
	 * tunable is set, in which case we will leave the newly-generated
	 * value.
	 */
	if (!drc->drc_heal && drc->drc_raw && drc->drc_ivset_guid != 0) {
		dmu_object_zapify(dp->dp_meta_objset, newsnapobj,
		    DMU_OT_DSL_DATASET, tx);
		VERIFY0(zap_update(dp->dp_meta_objset, newsnapobj,
		    DS_FIELD_IVSET_GUID, sizeof (uint64_t), 1,
		    &drc->drc_ivset_guid, tx));
	}

	/*
	 * Release the hold from dmu_recv_begin.  This must be done before
	 * we return to open context, so that when we free the dataset's dnode
	 * we can evict its bonus buffer. Since the dataset may be destroyed
	 * at this point (and therefore won't have a valid pointer to the spa)
	 * we release the key mapping manually here while we do have a valid
	 * pointer, if it exists.
	 */
	if (!drc->drc_raw && encrypted) {
		(void) spa_keystore_remove_mapping(dmu_tx_pool(tx)->dp_spa,
		    drc->drc_ds->ds_object, drc->drc_ds);
	}
	dsl_dataset_disown(drc->drc_ds, 0, dmu_recv_tag);
	drc->drc_ds = NULL;
}

/*
 * Passed to dsl_sync_task() by both dmu_recv_*_end() helpers below.
 * Presumably the estimated number of blocks the sync task modifies --
 * confirm against dsl_sync_task().
 */
static int dmu_recv_end_modified_blocks = 3;

/*
 * Finish a receive into an existing filesystem (drc_newfs not set).
 * In kernel builds the origin of drc_ds is unmounted first if necessary,
 * then the check/sync pair is run as a sync task on drc_tofs.
 * Returns the result of dsl_sync_task().
 */
static int
dmu_recv_existing_end(dmu_recv_cookie_t *drc)
{
#ifdef _KERNEL
	/*
	 * We will be destroying the ds; make sure its origin is unmounted if
	 * necessary.
	 */
	char name[ZFS_MAX_DATASET_NAME_LEN];
	dsl_dataset_name(drc->drc_ds, name);
	zfs_destroy_unmount_origin(name);
#endif

	return (dsl_sync_task(drc->drc_tofs,
	    dmu_recv_end_check, dmu_recv_end_sync, drc,
	    dmu_recv_end_modified_blocks, ZFS_SPACE_CHECK_NORMAL));
}

/*
 * Finish a receive that created a new filesystem (drc_newfs set): run
 * the check/sync pair as a sync task on drc_tofs.
 * Returns the result of dsl_sync_task().
 */
static int
dmu_recv_new_end(dmu_recv_cookie_t *drc)
{
	return (dsl_sync_task(drc->drc_tofs,
	    dmu_recv_end_check, dmu_recv_end_sync, drc,
	    dmu_recv_end_modified_blocks, ZFS_SPACE_CHECK_NORMAL));
}

/*
 * Finish a receive described by drc.
 *
 * owner is stored in drc->drc_owner before the sync task runs; the
 * check and sync callbacks read it from there.
 *
 * On failure the received dataset is cleaned up with
 * dmu_recv_cleanup_ds() and drc_keynvl is freed.  On success of a
 * non-heal receive, zvol_create_minor() is called for drc_tofs (new
 * filesystems only) and for the new snapshot "<tofs>@<tosnap>".
 *
 * Returns 0 on success or the error from the sync task.
 */
int
dmu_recv_end(dmu_recv_cookie_t *drc, void *owner)
{
	int error;

	drc->drc_owner = owner;

	if (drc->drc_newfs)
		error = dmu_recv_new_end(drc);
	else
		error = dmu_recv_existing_end(drc);

	if (error != 0) {
		dmu_recv_cleanup_ds(drc);
		nvlist_free(drc->drc_keynvl);
	} else if (!drc->drc_heal) {
		if (drc->drc_newfs) {
			zvol_create_minor(drc->drc_tofs);
		}
		/* Temporary "<tofs>@<tosnap>" name, freed right after use. */
		char *snapname = kmem_asprintf("%s@%s",
		    drc->drc_tofs, drc->drc_tosnap);
		zvol_create_minor(snapname);
		kmem_strfree(snapname);
	}
	return (error);
}

/*
 * Return TRUE if this objset is currently being received into.
*/
boolean_t
dmu_objset_is_receiving(objset_t *os)
{
	/* An objset without a dataset is never reported as receiving. */
	if (os->os_dsl_dataset == NULL)
		return (B_FALSE);

	/* Receiving means the dataset's owner is dmu_recv_tag. */
	return (os->os_dsl_dataset->ds_owner == dmu_recv_tag);
}

/* Module parameters; each string is the parameter's description. */
ZFS_MODULE_PARAM(zfs_recv, zfs_recv_, queue_length, UINT, ZMOD_RW,
	"Maximum receive queue length");

ZFS_MODULE_PARAM(zfs_recv, zfs_recv_, queue_ff, UINT, ZMOD_RW,
	"Receive queue fill fraction");

ZFS_MODULE_PARAM(zfs_recv, zfs_recv_, write_batch_size, UINT, ZMOD_RW,
	"Maximum amount of writes to batch into one transaction");

ZFS_MODULE_PARAM(zfs_recv, zfs_recv_, best_effort_corrective, INT, ZMOD_RW,
	"Ignore errors during corrective receive");
/* END CSTYLED */