diff --git a/cmd/raidz_test/raidz_test.c b/cmd/raidz_test/raidz_test.c index 1ece55960d33..34f3f6f1ccc6 100644 --- a/cmd/raidz_test/raidz_test.c +++ b/cmd/raidz_test/raidz_test.c @@ -1,1026 +1,1024 @@ /* * CDDL HEADER START * * The contents of this file are subject to the terms of the * Common Development and Distribution License (the "License"). * You may not use this file except in compliance with the License. * * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE * or https://opensource.org/licenses/CDDL-1.0. * See the License for the specific language governing permissions * and limitations under the License. * * When distributing Covered Code, include this CDDL HEADER in each * file and include the License file at usr/src/OPENSOLARIS.LICENSE. * If applicable, add the following below this CDDL HEADER, with the * fields enclosed by brackets "[]" replaced with your own identifying * information: Portions Copyright [yyyy] [name of copyright owner] * * CDDL HEADER END */ /* * Copyright (C) 2016 Gvozden Nešković. All rights reserved. */ #include #include #include #include #include #include #include #include #include #include "raidz_test.h" static int *rand_data; raidz_test_opts_t rto_opts; static char pid_s[16]; static void sig_handler(int signo) { int old_errno = errno; struct sigaction action; /* * Restore default action and re-raise signal so SIGSEGV and * SIGABRT can trigger a core dump. */ action.sa_handler = SIG_DFL; sigemptyset(&action.sa_mask); action.sa_flags = 0; (void) sigaction(signo, &action, NULL); if (rto_opts.rto_gdb) { pid_t pid = fork(); if (pid == 0) { execlp("gdb", "gdb", "-ex", "set pagination 0", "-p", pid_s, NULL); _exit(-1); } else if (pid > 0) while (waitpid(pid, NULL, 0) == -1 && errno == EINTR) ; } raise(signo); errno = old_errno; } static void print_opts(raidz_test_opts_t *opts, boolean_t force) { const char *verbose; switch (opts->rto_v) { case D_ALL: verbose = "no"; break; case D_INFO: verbose = "info"; break; case D_DEBUG: default: verbose = "debug"; break; } if (force || opts->rto_v >= D_INFO) { (void) fprintf(stdout, DBLSEP "Running with options:\n" " (-a) zio ashift : %zu\n" " (-o) zio offset : 1 << %zu\n" " (-e) expanded map : %s\n" " (-r) reflow offset : %llx\n" " (-d) number of raidz data columns : %zu\n" " (-s) size of DATA : 1 << %zu\n" " (-S) sweep parameters : %s \n" " (-v) verbose : %s \n\n", opts->rto_ashift, /* -a */ ilog2(opts->rto_offset), /* -o */ opts->rto_expand ? "yes" : "no", /* -e */ (u_longlong_t)opts->rto_expand_offset, /* -r */ opts->rto_dcols, /* -d */ ilog2(opts->rto_dsize), /* -s */ opts->rto_sweep ? "yes" : "no", /* -S */ verbose); /* -v */ } } static void usage(boolean_t requested) { const raidz_test_opts_t *o = &rto_opts_defaults; FILE *fp = requested ? stdout : stderr; (void) fprintf(fp, "Usage:\n" "\t[-a zio ashift (default: %zu)]\n" "\t[-o zio offset, exponent radix 2 (default: %zu)]\n" "\t[-d number of raidz data columns (default: %zu)]\n" "\t[-s zio size, exponent radix 2 (default: %zu)]\n" "\t[-S parameter sweep (default: %s)]\n" "\t[-t timeout for parameter sweep test]\n" "\t[-B benchmark all raidz implementations]\n" "\t[-e use expanded raidz map (default: %s)]\n" "\t[-r expanded raidz map reflow offset (default: %llx)]\n" "\t[-v increase verbosity (default: %d)]\n" "\t[-h (print help)]\n" "\t[-T test the test, see if failure would be detected]\n" "\t[-D debug (attach gdb on SIGSEGV)]\n" "", o->rto_ashift, /* -a */ ilog2(o->rto_offset), /* -o */ o->rto_dcols, /* -d */ ilog2(o->rto_dsize), /* -s */ rto_opts.rto_sweep ? "yes" : "no", /* -S */ rto_opts.rto_expand ? "yes" : "no", /* -e */ (u_longlong_t)o->rto_expand_offset, /* -r */ o->rto_v); /* -v */ exit(requested ? 0 : 1); } static void process_options(int argc, char **argv) { size_t value; int opt; raidz_test_opts_t *o = &rto_opts; memcpy(o, &rto_opts_defaults, sizeof (*o)); while ((opt = getopt(argc, argv, "TDBSvha:er:o:d:s:t:")) != -1) { - value = 0; - switch (opt) { case 'a': value = strtoull(optarg, NULL, 0); o->rto_ashift = MIN(13, MAX(9, value)); break; case 'e': o->rto_expand = 1; break; case 'r': o->rto_expand_offset = strtoull(optarg, NULL, 0); break; case 'o': value = strtoull(optarg, NULL, 0); o->rto_offset = ((1ULL << MIN(12, value)) >> 9) << 9; break; case 'd': value = strtoull(optarg, NULL, 0); o->rto_dcols = MIN(255, MAX(1, value)); break; case 's': value = strtoull(optarg, NULL, 0); o->rto_dsize = 1ULL << MIN(SPA_MAXBLOCKSHIFT, MAX(SPA_MINBLOCKSHIFT, value)); break; case 't': value = strtoull(optarg, NULL, 0); o->rto_sweep_timeout = value; break; case 'v': o->rto_v++; break; case 'S': o->rto_sweep = 1; break; case 'B': o->rto_benchmark = 1; break; case 'D': o->rto_gdb = 1; break; case 'T': o->rto_sanity = 1; break; case 'h': usage(B_TRUE); break; case '?': default: usage(B_FALSE); break; } } } #define DATA_COL(rr, i) ((rr)->rr_col[rr->rr_firstdatacol + (i)].rc_abd) #define DATA_COL_SIZE(rr, i) ((rr)->rr_col[rr->rr_firstdatacol + (i)].rc_size) #define CODE_COL(rr, i) ((rr)->rr_col[(i)].rc_abd) #define CODE_COL_SIZE(rr, i) ((rr)->rr_col[(i)].rc_size) static int cmp_code(raidz_test_opts_t *opts, const raidz_map_t *rm, const int parity) { int r, i, ret = 0; VERIFY(parity >= 1 && parity <= 3); for (r = 0; r < rm->rm_nrows; r++) { raidz_row_t * const rr = rm->rm_row[r]; raidz_row_t * const rrg = opts->rm_golden->rm_row[r]; for (i = 0; i < parity; i++) { if (CODE_COL_SIZE(rrg, i) == 0) { VERIFY0(CODE_COL_SIZE(rr, i)); continue; } if (abd_cmp(CODE_COL(rr, i), CODE_COL(rrg, i)) != 0) { ret++; LOG_OPT(D_DEBUG, opts, "\nParity block [%d] different!\n", i); } } } return (ret); } static int cmp_data(raidz_test_opts_t *opts, raidz_map_t *rm) { int r, i, dcols, ret = 0; for (r = 0; r < rm->rm_nrows; r++) { raidz_row_t *rr = rm->rm_row[r]; raidz_row_t *rrg = opts->rm_golden->rm_row[r]; dcols = opts->rm_golden->rm_row[0]->rr_cols - raidz_parity(opts->rm_golden); for (i = 0; i < dcols; i++) { if (DATA_COL_SIZE(rrg, i) == 0) { VERIFY0(DATA_COL_SIZE(rr, i)); continue; } if (abd_cmp(DATA_COL(rrg, i), DATA_COL(rr, i)) != 0) { ret++; LOG_OPT(D_DEBUG, opts, "\nData block [%d] different!\n", i); } } } return (ret); } static int init_rand(void *data, size_t size, void *private) { (void) private; memcpy(data, rand_data, size); return (0); } static void corrupt_colums(raidz_map_t *rm, const int *tgts, const int cnt) { for (int r = 0; r < rm->rm_nrows; r++) { raidz_row_t *rr = rm->rm_row[r]; for (int i = 0; i < cnt; i++) { raidz_col_t *col = &rr->rr_col[tgts[i]]; abd_iterate_func(col->rc_abd, 0, col->rc_size, init_rand, NULL); } } } void init_zio_abd(zio_t *zio) { abd_iterate_func(zio->io_abd, 0, zio->io_size, init_rand, NULL); } static void fini_raidz_map(zio_t **zio, raidz_map_t **rm) { vdev_raidz_map_free(*rm); raidz_free((*zio)->io_abd, (*zio)->io_size); umem_free(*zio, sizeof (zio_t)); *zio = NULL; *rm = NULL; } static int init_raidz_golden_map(raidz_test_opts_t *opts, const int parity) { int err = 0; zio_t *zio_test; raidz_map_t *rm_test; const size_t total_ncols = opts->rto_dcols + parity; if (opts->rm_golden) { fini_raidz_map(&opts->zio_golden, &opts->rm_golden); } opts->zio_golden = umem_zalloc(sizeof (zio_t), UMEM_NOFAIL); zio_test = umem_zalloc(sizeof (zio_t), UMEM_NOFAIL); opts->zio_golden->io_offset = zio_test->io_offset = opts->rto_offset; opts->zio_golden->io_size = zio_test->io_size = opts->rto_dsize; opts->zio_golden->io_abd = raidz_alloc(opts->rto_dsize); zio_test->io_abd = raidz_alloc(opts->rto_dsize); init_zio_abd(opts->zio_golden); init_zio_abd(zio_test); VERIFY0(vdev_raidz_impl_set("original")); if (opts->rto_expand) { opts->rm_golden = vdev_raidz_map_alloc_expanded(opts->zio_golden->io_abd, opts->zio_golden->io_size, opts->zio_golden->io_offset, opts->rto_ashift, total_ncols+1, total_ncols, parity, opts->rto_expand_offset); rm_test = vdev_raidz_map_alloc_expanded(zio_test->io_abd, zio_test->io_size, zio_test->io_offset, opts->rto_ashift, total_ncols+1, total_ncols, parity, opts->rto_expand_offset); } else { opts->rm_golden = vdev_raidz_map_alloc(opts->zio_golden, opts->rto_ashift, total_ncols, parity); rm_test = vdev_raidz_map_alloc(zio_test, opts->rto_ashift, total_ncols, parity); } VERIFY(opts->zio_golden); VERIFY(opts->rm_golden); vdev_raidz_generate_parity(opts->rm_golden); vdev_raidz_generate_parity(rm_test); /* sanity check */ err |= cmp_data(opts, rm_test); err |= cmp_code(opts, rm_test, parity); if (err) ERR("initializing the golden copy ... [FAIL]!\n"); /* tear down raidz_map of test zio */ fini_raidz_map(&zio_test, &rm_test); return (err); } /* * If reflow is not in progress, reflow_offset should be UINT64_MAX. * For each row, if the row is entirely before reflow_offset, it will * come from the new location. Otherwise this row will come from the * old location. Therefore, rows that straddle the reflow_offset will * come from the old location. * * NOTE: Until raidz expansion is implemented this function is only * needed by raidz_test.c to the multi-row raid_map_t functionality. */ raidz_map_t * vdev_raidz_map_alloc_expanded(abd_t *abd, uint64_t size, uint64_t offset, uint64_t ashift, uint64_t physical_cols, uint64_t logical_cols, uint64_t nparity, uint64_t reflow_offset) { /* The zio's size in units of the vdev's minimum sector size. */ uint64_t s = size >> ashift; uint64_t q, r, bc, devidx, asize = 0, tot; /* * "Quotient": The number of data sectors for this stripe on all but * the "big column" child vdevs that also contain "remainder" data. * AKA "full rows" */ q = s / (logical_cols - nparity); /* * "Remainder": The number of partial stripe data sectors in this I/O. * This will add a sector to some, but not all, child vdevs. */ r = s - q * (logical_cols - nparity); /* The number of "big columns" - those which contain remainder data. */ bc = (r == 0 ? 0 : r + nparity); /* * The total number of data and parity sectors associated with * this I/O. */ tot = s + nparity * (q + (r == 0 ? 0 : 1)); /* How many rows contain data (not skip) */ uint64_t rows = howmany(tot, logical_cols); int cols = MIN(tot, logical_cols); raidz_map_t *rm = kmem_zalloc(offsetof(raidz_map_t, rm_row[rows]), KM_SLEEP); rm->rm_nrows = rows; for (uint64_t row = 0; row < rows; row++) { raidz_row_t *rr = kmem_alloc(offsetof(raidz_row_t, rr_col[cols]), KM_SLEEP); rm->rm_row[row] = rr; /* The starting RAIDZ (parent) vdev sector of the row. */ uint64_t b = (offset >> ashift) + row * logical_cols; /* * If we are in the middle of a reflow, and any part of this * row has not been copied, then use the old location of * this row. */ int row_phys_cols = physical_cols; if (b + (logical_cols - nparity) > reflow_offset >> ashift) row_phys_cols--; /* starting child of this row */ uint64_t child_id = b % row_phys_cols; /* The starting byte offset on each child vdev. */ uint64_t child_offset = (b / row_phys_cols) << ashift; /* * We set cols to the entire width of the block, even * if this row is shorter. This is needed because parity * generation (for Q and R) needs to know the entire width, * because it treats the short row as though it was * full-width (and the "phantom" sectors were zero-filled). * * Another approach to this would be to set cols shorter * (to just the number of columns that we might do i/o to) * and have another mechanism to tell the parity generation * about the "entire width". Reconstruction (at least * vdev_raidz_reconstruct_general()) would also need to * know about the "entire width". */ rr->rr_cols = cols; rr->rr_bigcols = bc; rr->rr_missingdata = 0; rr->rr_missingparity = 0; rr->rr_firstdatacol = nparity; rr->rr_abd_empty = NULL; rr->rr_nempty = 0; for (int c = 0; c < rr->rr_cols; c++, child_id++) { if (child_id >= row_phys_cols) { child_id -= row_phys_cols; child_offset += 1ULL << ashift; } rr->rr_col[c].rc_devidx = child_id; rr->rr_col[c].rc_offset = child_offset; rr->rr_col[c].rc_orig_data = NULL; rr->rr_col[c].rc_error = 0; rr->rr_col[c].rc_tried = 0; rr->rr_col[c].rc_skipped = 0; rr->rr_col[c].rc_need_orig_restore = B_FALSE; uint64_t dc = c - rr->rr_firstdatacol; if (c < rr->rr_firstdatacol) { rr->rr_col[c].rc_size = 1ULL << ashift; rr->rr_col[c].rc_abd = abd_alloc_linear(rr->rr_col[c].rc_size, B_TRUE); } else if (row == rows - 1 && bc != 0 && c >= bc) { /* * Past the end, this for parity generation. */ rr->rr_col[c].rc_size = 0; rr->rr_col[c].rc_abd = NULL; } else { /* * "data column" (col excluding parity) * Add an ASCII art diagram here */ uint64_t off; if (c < bc || r == 0) { off = dc * rows + row; } else { off = r * rows + (dc - r) * (rows - 1) + row; } rr->rr_col[c].rc_size = 1ULL << ashift; rr->rr_col[c].rc_abd = abd_get_offset_struct( &rr->rr_col[c].rc_abdstruct, abd, off << ashift, 1 << ashift); } asize += rr->rr_col[c].rc_size; } /* * If all data stored spans all columns, there's a danger that * parity will always be on the same device and, since parity * isn't read during normal operation, that that device's I/O * bandwidth won't be used effectively. We therefore switch * the parity every 1MB. * * ...at least that was, ostensibly, the theory. As a practical * matter unless we juggle the parity between all devices * evenly, we won't see any benefit. Further, occasional writes * that aren't a multiple of the LCM of the number of children * and the minimum stripe width are sufficient to avoid pessimal * behavior. Unfortunately, this decision created an implicit * on-disk format requirement that we need to support for all * eternity, but only for single-parity RAID-Z. * * If we intend to skip a sector in the zeroth column for * padding we must make sure to note this swap. We will never * intend to skip the first column since at least one data and * one parity column must appear in each row. */ if (rr->rr_firstdatacol == 1 && rr->rr_cols > 1 && (offset & (1ULL << 20))) { ASSERT(rr->rr_cols >= 2); ASSERT(rr->rr_col[0].rc_size == rr->rr_col[1].rc_size); devidx = rr->rr_col[0].rc_devidx; uint64_t o = rr->rr_col[0].rc_offset; rr->rr_col[0].rc_devidx = rr->rr_col[1].rc_devidx; rr->rr_col[0].rc_offset = rr->rr_col[1].rc_offset; rr->rr_col[1].rc_devidx = devidx; rr->rr_col[1].rc_offset = o; } } ASSERT3U(asize, ==, tot << ashift); /* init RAIDZ parity ops */ rm->rm_ops = vdev_raidz_math_get_ops(); return (rm); } static raidz_map_t * init_raidz_map(raidz_test_opts_t *opts, zio_t **zio, const int parity) { raidz_map_t *rm = NULL; const size_t alloc_dsize = opts->rto_dsize; const size_t total_ncols = opts->rto_dcols + parity; const int ccols[] = { 0, 1, 2 }; VERIFY(zio); VERIFY(parity <= 3 && parity >= 1); *zio = umem_zalloc(sizeof (zio_t), UMEM_NOFAIL); (*zio)->io_offset = 0; (*zio)->io_size = alloc_dsize; (*zio)->io_abd = raidz_alloc(alloc_dsize); init_zio_abd(*zio); if (opts->rto_expand) { rm = vdev_raidz_map_alloc_expanded((*zio)->io_abd, (*zio)->io_size, (*zio)->io_offset, opts->rto_ashift, total_ncols+1, total_ncols, parity, opts->rto_expand_offset); } else { rm = vdev_raidz_map_alloc(*zio, opts->rto_ashift, total_ncols, parity); } VERIFY(rm); /* Make sure code columns are destroyed */ corrupt_colums(rm, ccols, parity); return (rm); } static int run_gen_check(raidz_test_opts_t *opts) { char **impl_name; int fn, err = 0; zio_t *zio_test; raidz_map_t *rm_test; err = init_raidz_golden_map(opts, PARITY_PQR); if (0 != err) return (err); LOG(D_INFO, DBLSEP); LOG(D_INFO, "Testing parity generation...\n"); for (impl_name = (char **)raidz_impl_names+1; *impl_name != NULL; impl_name++) { LOG(D_INFO, SEP); LOG(D_INFO, "\tTesting [%s] implementation...", *impl_name); if (0 != vdev_raidz_impl_set(*impl_name)) { LOG(D_INFO, "[SKIP]\n"); continue; } else { LOG(D_INFO, "[SUPPORTED]\n"); } for (fn = 0; fn < RAIDZ_GEN_NUM; fn++) { /* Check if should stop */ if (rto_opts.rto_should_stop) return (err); /* create suitable raidz_map */ rm_test = init_raidz_map(opts, &zio_test, fn+1); VERIFY(rm_test); LOG(D_INFO, "\t\tTesting method [%s] ...", raidz_gen_name[fn]); if (!opts->rto_sanity) vdev_raidz_generate_parity(rm_test); if (cmp_code(opts, rm_test, fn+1) != 0) { LOG(D_INFO, "[FAIL]\n"); err++; } else LOG(D_INFO, "[PASS]\n"); fini_raidz_map(&zio_test, &rm_test); } } fini_raidz_map(&opts->zio_golden, &opts->rm_golden); return (err); } static int run_rec_check_impl(raidz_test_opts_t *opts, raidz_map_t *rm, const int fn) { int x0, x1, x2; int tgtidx[3]; int err = 0; static const int rec_tgts[7][3] = { {1, 2, 3}, /* rec_p: bad QR & D[0] */ {0, 2, 3}, /* rec_q: bad PR & D[0] */ {0, 1, 3}, /* rec_r: bad PQ & D[0] */ {2, 3, 4}, /* rec_pq: bad R & D[0][1] */ {1, 3, 4}, /* rec_pr: bad Q & D[0][1] */ {0, 3, 4}, /* rec_qr: bad P & D[0][1] */ {3, 4, 5} /* rec_pqr: bad & D[0][1][2] */ }; memcpy(tgtidx, rec_tgts[fn], sizeof (tgtidx)); if (fn < RAIDZ_REC_PQ) { /* can reconstruct 1 failed data disk */ for (x0 = 0; x0 < opts->rto_dcols; x0++) { if (x0 >= rm->rm_row[0]->rr_cols - raidz_parity(rm)) continue; /* Check if should stop */ if (rto_opts.rto_should_stop) return (err); LOG(D_DEBUG, "[%d] ", x0); tgtidx[2] = x0 + raidz_parity(rm); corrupt_colums(rm, tgtidx+2, 1); if (!opts->rto_sanity) vdev_raidz_reconstruct(rm, tgtidx, 3); if (cmp_data(opts, rm) != 0) { err++; LOG(D_DEBUG, "\nREC D[%d]... [FAIL]\n", x0); } } } else if (fn < RAIDZ_REC_PQR) { /* can reconstruct 2 failed data disk */ for (x0 = 0; x0 < opts->rto_dcols; x0++) { if (x0 >= rm->rm_row[0]->rr_cols - raidz_parity(rm)) continue; for (x1 = x0 + 1; x1 < opts->rto_dcols; x1++) { if (x1 >= rm->rm_row[0]->rr_cols - raidz_parity(rm)) continue; /* Check if should stop */ if (rto_opts.rto_should_stop) return (err); LOG(D_DEBUG, "[%d %d] ", x0, x1); tgtidx[1] = x0 + raidz_parity(rm); tgtidx[2] = x1 + raidz_parity(rm); corrupt_colums(rm, tgtidx+1, 2); if (!opts->rto_sanity) vdev_raidz_reconstruct(rm, tgtidx, 3); if (cmp_data(opts, rm) != 0) { err++; LOG(D_DEBUG, "\nREC D[%d %d]... " "[FAIL]\n", x0, x1); } } } } else { /* can reconstruct 3 failed data disk */ for (x0 = 0; x0 < opts->rto_dcols; x0++) { if (x0 >= rm->rm_row[0]->rr_cols - raidz_parity(rm)) continue; for (x1 = x0 + 1; x1 < opts->rto_dcols; x1++) { if (x1 >= rm->rm_row[0]->rr_cols - raidz_parity(rm)) continue; for (x2 = x1 + 1; x2 < opts->rto_dcols; x2++) { if (x2 >= rm->rm_row[0]->rr_cols - raidz_parity(rm)) continue; /* Check if should stop */ if (rto_opts.rto_should_stop) return (err); LOG(D_DEBUG, "[%d %d %d]", x0, x1, x2); tgtidx[0] = x0 + raidz_parity(rm); tgtidx[1] = x1 + raidz_parity(rm); tgtidx[2] = x2 + raidz_parity(rm); corrupt_colums(rm, tgtidx, 3); if (!opts->rto_sanity) vdev_raidz_reconstruct(rm, tgtidx, 3); if (cmp_data(opts, rm) != 0) { err++; LOG(D_DEBUG, "\nREC D[%d %d %d]... " "[FAIL]\n", x0, x1, x2); } } } } } return (err); } static int run_rec_check(raidz_test_opts_t *opts) { char **impl_name; unsigned fn, err = 0; zio_t *zio_test; raidz_map_t *rm_test; err = init_raidz_golden_map(opts, PARITY_PQR); if (0 != err) return (err); LOG(D_INFO, DBLSEP); LOG(D_INFO, "Testing data reconstruction...\n"); for (impl_name = (char **)raidz_impl_names+1; *impl_name != NULL; impl_name++) { LOG(D_INFO, SEP); LOG(D_INFO, "\tTesting [%s] implementation...", *impl_name); if (vdev_raidz_impl_set(*impl_name) != 0) { LOG(D_INFO, "[SKIP]\n"); continue; } else LOG(D_INFO, "[SUPPORTED]\n"); /* create suitable raidz_map */ rm_test = init_raidz_map(opts, &zio_test, PARITY_PQR); /* generate parity */ vdev_raidz_generate_parity(rm_test); for (fn = 0; fn < RAIDZ_REC_NUM; fn++) { LOG(D_INFO, "\t\tTesting method [%s] ...", raidz_rec_name[fn]); if (run_rec_check_impl(opts, rm_test, fn) != 0) { LOG(D_INFO, "[FAIL]\n"); err++; } else LOG(D_INFO, "[PASS]\n"); } /* tear down test raidz_map */ fini_raidz_map(&zio_test, &rm_test); } fini_raidz_map(&opts->zio_golden, &opts->rm_golden); return (err); } static int run_test(raidz_test_opts_t *opts) { int err = 0; if (opts == NULL) opts = &rto_opts; print_opts(opts, B_FALSE); err |= run_gen_check(opts); err |= run_rec_check(opts); return (err); } #define SWEEP_RUNNING 0 #define SWEEP_FINISHED 1 #define SWEEP_ERROR 2 #define SWEEP_TIMEOUT 3 static int sweep_state = 0; static raidz_test_opts_t failed_opts; static kmutex_t sem_mtx; static kcondvar_t sem_cv; static int max_free_slots; static int free_slots; static __attribute__((noreturn)) void sweep_thread(void *arg) { int err = 0; raidz_test_opts_t *opts = (raidz_test_opts_t *)arg; VERIFY(opts != NULL); err = run_test(opts); if (rto_opts.rto_sanity) { /* 25% chance that a sweep test fails */ if (rand() < (RAND_MAX/4)) err = 1; } if (0 != err) { mutex_enter(&sem_mtx); memcpy(&failed_opts, opts, sizeof (raidz_test_opts_t)); sweep_state = SWEEP_ERROR; mutex_exit(&sem_mtx); } umem_free(opts, sizeof (raidz_test_opts_t)); /* signal the next thread */ mutex_enter(&sem_mtx); free_slots++; cv_signal(&sem_cv); mutex_exit(&sem_mtx); thread_exit(); } static int run_sweep(void) { static const size_t dcols_v[] = { 1, 2, 3, 4, 5, 6, 7, 8, 12, 15, 16 }; static const size_t ashift_v[] = { 9, 12, 14 }; static const size_t size_v[] = { 1 << 9, 21 * (1 << 9), 13 * (1 << 12), 1 << 17, (1 << 20) - (1 << 12), SPA_MAXBLOCKSIZE }; (void) setvbuf(stdout, NULL, _IONBF, 0); ulong_t total_comb = ARRAY_SIZE(size_v) * ARRAY_SIZE(ashift_v) * ARRAY_SIZE(dcols_v); ulong_t tried_comb = 0; hrtime_t time_diff, start_time = gethrtime(); raidz_test_opts_t *opts; int a, d, s; max_free_slots = free_slots = MAX(2, boot_ncpus); mutex_init(&sem_mtx, NULL, MUTEX_DEFAULT, NULL); cv_init(&sem_cv, NULL, CV_DEFAULT, NULL); for (s = 0; s < ARRAY_SIZE(size_v); s++) for (a = 0; a < ARRAY_SIZE(ashift_v); a++) for (d = 0; d < ARRAY_SIZE(dcols_v); d++) { if (size_v[s] < (1 << ashift_v[a])) { total_comb--; continue; } if (++tried_comb % 20 == 0) LOG(D_ALL, "%lu/%lu... ", tried_comb, total_comb); /* wait for signal to start new thread */ mutex_enter(&sem_mtx); while (cv_timedwait_sig(&sem_cv, &sem_mtx, ddi_get_lbolt() + hz)) { /* check if should stop the test (timeout) */ time_diff = (gethrtime() - start_time) / NANOSEC; if (rto_opts.rto_sweep_timeout > 0 && time_diff >= rto_opts.rto_sweep_timeout) { sweep_state = SWEEP_TIMEOUT; rto_opts.rto_should_stop = B_TRUE; mutex_exit(&sem_mtx); goto exit; } /* check if should stop the test (error) */ if (sweep_state != SWEEP_RUNNING) { mutex_exit(&sem_mtx); goto exit; } /* exit loop if a slot is available */ if (free_slots > 0) { break; } } free_slots--; mutex_exit(&sem_mtx); opts = umem_zalloc(sizeof (raidz_test_opts_t), UMEM_NOFAIL); opts->rto_ashift = ashift_v[a]; opts->rto_dcols = dcols_v[d]; opts->rto_offset = (1 << ashift_v[a]) * rand(); opts->rto_dsize = size_v[s]; opts->rto_expand = rto_opts.rto_expand; opts->rto_expand_offset = rto_opts.rto_expand_offset; opts->rto_v = 0; /* be quiet */ VERIFY3P(thread_create(NULL, 0, sweep_thread, (void *) opts, 0, NULL, TS_RUN, defclsyspri), !=, NULL); } exit: LOG(D_ALL, "\nWaiting for test threads to finish...\n"); mutex_enter(&sem_mtx); VERIFY(free_slots <= max_free_slots); while (free_slots < max_free_slots) { (void) cv_wait(&sem_cv, &sem_mtx); } mutex_exit(&sem_mtx); if (sweep_state == SWEEP_ERROR) { ERR("Sweep test failed! Failed option: \n"); print_opts(&failed_opts, B_TRUE); } else { if (sweep_state == SWEEP_TIMEOUT) LOG(D_ALL, "Test timeout (%lus). Stopping...\n", (ulong_t)rto_opts.rto_sweep_timeout); LOG(D_ALL, "Sweep test succeeded on %lu raidz maps!\n", (ulong_t)tried_comb); } mutex_destroy(&sem_mtx); return (sweep_state == SWEEP_ERROR ? SWEEP_ERROR : 0); } int main(int argc, char **argv) { size_t i; struct sigaction action; int err = 0; /* init gdb pid string early */ (void) sprintf(pid_s, "%d", getpid()); action.sa_handler = sig_handler; sigemptyset(&action.sa_mask); action.sa_flags = 0; if (sigaction(SIGSEGV, &action, NULL) < 0) { ERR("raidz_test: cannot catch SIGSEGV: %s.\n", strerror(errno)); exit(EXIT_FAILURE); } (void) setvbuf(stdout, NULL, _IOLBF, 0); dprintf_setup(&argc, argv); process_options(argc, argv); kernel_init(SPA_MODE_READ); /* setup random data because rand() is not reentrant */ rand_data = (int *)umem_alloc(SPA_MAXBLOCKSIZE, UMEM_NOFAIL); srand((unsigned)time(NULL) * getpid()); for (i = 0; i < SPA_MAXBLOCKSIZE / sizeof (int); i++) rand_data[i] = rand(); mprotect(rand_data, SPA_MAXBLOCKSIZE, PROT_READ); if (rto_opts.rto_benchmark) { run_raidz_benchmark(); } else if (rto_opts.rto_sweep) { err = run_sweep(); } else { err = run_test(NULL); } umem_free(rand_data, SPA_MAXBLOCKSIZE); kernel_fini(); return (err); } diff --git a/cmd/zfs/zfs_main.c b/cmd/zfs/zfs_main.c index 7bda3f5292b3..ac51df0f9c10 100644 --- a/cmd/zfs/zfs_main.c +++ b/cmd/zfs/zfs_main.c @@ -1,8824 +1,8829 @@ /* * CDDL HEADER START * * The contents of this file are subject to the terms of the * Common Development and Distribution License (the "License"). * You may not use this file except in compliance with the License. * * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE * or https://opensource.org/licenses/CDDL-1.0. * See the License for the specific language governing permissions * and limitations under the License. * * When distributing Covered Code, include this CDDL HEADER in each * file and include the License file at usr/src/OPENSOLARIS.LICENSE. * If applicable, add the following below this CDDL HEADER, with the * fields enclosed by brackets "[]" replaced with your own identifying * information: Portions Copyright [yyyy] [name of copyright owner] * * CDDL HEADER END */ /* * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. * Copyright (c) 2011, 2020 by Delphix. All rights reserved. * Copyright 2012 Milan Jurik. All rights reserved. * Copyright (c) 2012, Joyent, Inc. All rights reserved. * Copyright (c) 2013 Steven Hartland. All rights reserved. * Copyright 2016 Igor Kozhukhov . * Copyright 2016 Nexenta Systems, Inc. * Copyright (c) 2019 Datto Inc. * Copyright (c) 2019, loli10K * Copyright 2019 Joyent, Inc. * Copyright (c) 2019, 2020 by Christian Schwarz. All rights reserved. */ #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #ifdef HAVE_IDMAP #include #include #endif /* HAVE_IDMAP */ #include "zfs_iter.h" #include "zfs_util.h" #include "zfs_comutil.h" #include "zfs_projectutil.h" libzfs_handle_t *g_zfs; static char history_str[HIS_MAX_RECORD_LEN]; static boolean_t log_history = B_TRUE; static int zfs_do_clone(int argc, char **argv); static int zfs_do_create(int argc, char **argv); static int zfs_do_destroy(int argc, char **argv); static int zfs_do_get(int argc, char **argv); static int zfs_do_inherit(int argc, char **argv); static int zfs_do_list(int argc, char **argv); static int zfs_do_mount(int argc, char **argv); static int zfs_do_rename(int argc, char **argv); static int zfs_do_rollback(int argc, char **argv); static int zfs_do_set(int argc, char **argv); static int zfs_do_upgrade(int argc, char **argv); static int zfs_do_snapshot(int argc, char **argv); static int zfs_do_unmount(int argc, char **argv); static int zfs_do_share(int argc, char **argv); static int zfs_do_unshare(int argc, char **argv); static int zfs_do_send(int argc, char **argv); static int zfs_do_receive(int argc, char **argv); static int zfs_do_promote(int argc, char **argv); static int zfs_do_userspace(int argc, char **argv); static int zfs_do_allow(int argc, char **argv); static int zfs_do_unallow(int argc, char **argv); static int zfs_do_hold(int argc, char **argv); static int zfs_do_holds(int argc, char **argv); static int zfs_do_release(int argc, char **argv); static int zfs_do_diff(int argc, char **argv); static int zfs_do_bookmark(int argc, char **argv); static int zfs_do_channel_program(int argc, char **argv); static int zfs_do_load_key(int argc, char **argv); static int zfs_do_unload_key(int argc, char **argv); static int zfs_do_change_key(int argc, char **argv); static int zfs_do_project(int argc, char **argv); static int zfs_do_version(int argc, char **argv); static int zfs_do_redact(int argc, char **argv); static int zfs_do_wait(int argc, char **argv); #ifdef __FreeBSD__ static int zfs_do_jail(int argc, char **argv); static int zfs_do_unjail(int argc, char **argv); #endif #ifdef __linux__ static int zfs_do_zone(int argc, char **argv); static int zfs_do_unzone(int argc, char **argv); #endif /* * Enable a reasonable set of defaults for libumem debugging on DEBUG builds. */ #ifdef DEBUG const char * _umem_debug_init(void) { return ("default,verbose"); /* $UMEM_DEBUG setting */ } const char * _umem_logging_init(void) { return ("fail,contents"); /* $UMEM_LOGGING setting */ } #endif typedef enum { HELP_CLONE, HELP_CREATE, HELP_DESTROY, HELP_GET, HELP_INHERIT, HELP_UPGRADE, HELP_LIST, HELP_MOUNT, HELP_PROMOTE, HELP_RECEIVE, HELP_RENAME, HELP_ROLLBACK, HELP_SEND, HELP_SET, HELP_SHARE, HELP_SNAPSHOT, HELP_UNMOUNT, HELP_UNSHARE, HELP_ALLOW, HELP_UNALLOW, HELP_USERSPACE, HELP_GROUPSPACE, HELP_PROJECTSPACE, HELP_PROJECT, HELP_HOLD, HELP_HOLDS, HELP_RELEASE, HELP_DIFF, HELP_BOOKMARK, HELP_CHANNEL_PROGRAM, HELP_LOAD_KEY, HELP_UNLOAD_KEY, HELP_CHANGE_KEY, HELP_VERSION, HELP_REDACT, HELP_JAIL, HELP_UNJAIL, HELP_WAIT, HELP_ZONE, HELP_UNZONE, } zfs_help_t; typedef struct zfs_command { const char *name; int (*func)(int argc, char **argv); zfs_help_t usage; } zfs_command_t; /* * Master command table. Each ZFS command has a name, associated function, and * usage message. The usage messages need to be internationalized, so we have * to have a function to return the usage message based on a command index. * * These commands are organized according to how they are displayed in the usage * message. An empty command (one with a NULL name) indicates an empty line in * the generic usage message. */ static zfs_command_t command_table[] = { { "version", zfs_do_version, HELP_VERSION }, { NULL }, { "create", zfs_do_create, HELP_CREATE }, { "destroy", zfs_do_destroy, HELP_DESTROY }, { NULL }, { "snapshot", zfs_do_snapshot, HELP_SNAPSHOT }, { "rollback", zfs_do_rollback, HELP_ROLLBACK }, { "clone", zfs_do_clone, HELP_CLONE }, { "promote", zfs_do_promote, HELP_PROMOTE }, { "rename", zfs_do_rename, HELP_RENAME }, { "bookmark", zfs_do_bookmark, HELP_BOOKMARK }, { "program", zfs_do_channel_program, HELP_CHANNEL_PROGRAM }, { NULL }, { "list", zfs_do_list, HELP_LIST }, { NULL }, { "set", zfs_do_set, HELP_SET }, { "get", zfs_do_get, HELP_GET }, { "inherit", zfs_do_inherit, HELP_INHERIT }, { "upgrade", zfs_do_upgrade, HELP_UPGRADE }, { NULL }, { "userspace", zfs_do_userspace, HELP_USERSPACE }, { "groupspace", zfs_do_userspace, HELP_GROUPSPACE }, { "projectspace", zfs_do_userspace, HELP_PROJECTSPACE }, { NULL }, { "project", zfs_do_project, HELP_PROJECT }, { NULL }, { "mount", zfs_do_mount, HELP_MOUNT }, { "unmount", zfs_do_unmount, HELP_UNMOUNT }, { "share", zfs_do_share, HELP_SHARE }, { "unshare", zfs_do_unshare, HELP_UNSHARE }, { NULL }, { "send", zfs_do_send, HELP_SEND }, { "receive", zfs_do_receive, HELP_RECEIVE }, { NULL }, { "allow", zfs_do_allow, HELP_ALLOW }, { NULL }, { "unallow", zfs_do_unallow, HELP_UNALLOW }, { NULL }, { "hold", zfs_do_hold, HELP_HOLD }, { "holds", zfs_do_holds, HELP_HOLDS }, { "release", zfs_do_release, HELP_RELEASE }, { "diff", zfs_do_diff, HELP_DIFF }, { "load-key", zfs_do_load_key, HELP_LOAD_KEY }, { "unload-key", zfs_do_unload_key, HELP_UNLOAD_KEY }, { "change-key", zfs_do_change_key, HELP_CHANGE_KEY }, { "redact", zfs_do_redact, HELP_REDACT }, { "wait", zfs_do_wait, HELP_WAIT }, #ifdef __FreeBSD__ { "jail", zfs_do_jail, HELP_JAIL }, { "unjail", zfs_do_unjail, HELP_UNJAIL }, #endif #ifdef __linux__ { "zone", zfs_do_zone, HELP_ZONE }, { "unzone", zfs_do_unzone, HELP_UNZONE }, #endif }; #define NCOMMAND (sizeof (command_table) / sizeof (command_table[0])) zfs_command_t *current_command; static const char * get_usage(zfs_help_t idx) { switch (idx) { case HELP_CLONE: return (gettext("\tclone [-p] [-o property=value] ... " " \n")); case HELP_CREATE: return (gettext("\tcreate [-Pnpuv] [-o property=value] ... " "\n" "\tcreate [-Pnpsv] [-b blocksize] [-o property=value] ... " "-V \n")); case HELP_DESTROY: return (gettext("\tdestroy [-fnpRrv] \n" "\tdestroy [-dnpRrv] " "@[%][,...]\n" "\tdestroy #\n")); case HELP_GET: return (gettext("\tget [-rHp] [-d max] " "[-o \"all\" | field[,...]]\n" "\t [-t type[,...]] [-s source[,...]]\n" "\t <\"all\" | property[,...]> " "[filesystem|volume|snapshot|bookmark] ...\n")); case HELP_INHERIT: return (gettext("\tinherit [-rS] " " ...\n")); case HELP_UPGRADE: return (gettext("\tupgrade [-v]\n" "\tupgrade [-r] [-V version] <-a | filesystem ...>\n")); case HELP_LIST: return (gettext("\tlist [-Hp] [-r|-d max] [-o property[,...]] " "[-s property]...\n\t [-S property]... [-t type[,...]] " "[filesystem|volume|snapshot] ...\n")); case HELP_MOUNT: return (gettext("\tmount\n" "\tmount [-flvO] [-o opts] <-a | filesystem>\n")); case HELP_PROMOTE: return (gettext("\tpromote \n")); case HELP_RECEIVE: return (gettext("\treceive [-vMnsFhu] " "[-o =] ... [-x ] ...\n" "\t \n" "\treceive [-vMnsFhu] [-o =] ... " "[-x ] ... \n" "\t [-d | -e] \n" "\treceive -A \n")); case HELP_RENAME: return (gettext("\trename [-f] " "\n" "\trename -p [-f] \n" "\trename -u [-f] \n" "\trename -r \n")); case HELP_ROLLBACK: return (gettext("\trollback [-rRf] \n")); case HELP_SEND: return (gettext("\tsend [-DLPbcehnpsvw] " "[-i|-I snapshot]\n" "\t [-R [-X dataset[,dataset]...]] \n" "\tsend [-DnvPLecw] [-i snapshot|bookmark] " "\n" "\tsend [-DnPpvLec] [-i bookmark|snapshot] " "--redact \n" "\tsend [-nvPe] -t \n" "\tsend [-Pnv] --saved filesystem\n")); case HELP_SET: return (gettext("\tset ... " " ...\n")); case HELP_SHARE: return (gettext("\tshare [-l] <-a [nfs|smb] | filesystem>\n")); case HELP_SNAPSHOT: return (gettext("\tsnapshot [-r] [-o property=value] ... " "@ ...\n")); case HELP_UNMOUNT: return (gettext("\tunmount [-fu] " "<-a | filesystem|mountpoint>\n")); case HELP_UNSHARE: return (gettext("\tunshare " "<-a [nfs|smb] | filesystem|mountpoint>\n")); case HELP_ALLOW: return (gettext("\tallow \n" "\tallow [-ldug] " "<\"everyone\"|user|group>[,...] [,...]\n" "\t \n" "\tallow [-ld] -e [,...] " "\n" "\tallow -c [,...] \n" "\tallow -s @setname [,...] " "\n")); case HELP_UNALLOW: return (gettext("\tunallow [-rldug] " "<\"everyone\"|user|group>[,...]\n" "\t [[,...]] \n" "\tunallow [-rld] -e [[,...]] " "\n" "\tunallow [-r] -c [[,...]] " "\n" "\tunallow [-r] -s @setname [[,...]] " "\n")); case HELP_USERSPACE: return (gettext("\tuserspace [-Hinp] [-o field[,...]] " "[-s field] ...\n" "\t [-S field] ... [-t type[,...]] " "\n")); case HELP_GROUPSPACE: return (gettext("\tgroupspace [-Hinp] [-o field[,...]] " "[-s field] ...\n" "\t [-S field] ... [-t type[,...]] " "\n")); case HELP_PROJECTSPACE: return (gettext("\tprojectspace [-Hp] [-o field[,...]] " "[-s field] ... \n" "\t [-S field] ... \n")); case HELP_PROJECT: return (gettext("\tproject [-d|-r] \n" "\tproject -c [-0] [-d|-r] [-p id] \n" "\tproject -C [-k] [-r] \n" "\tproject [-p id] [-r] [-s] \n")); case HELP_HOLD: return (gettext("\thold [-r] ...\n")); case HELP_HOLDS: return (gettext("\tholds [-rH] ...\n")); case HELP_RELEASE: return (gettext("\trelease [-r] ...\n")); case HELP_DIFF: return (gettext("\tdiff [-FHt] " "[snapshot|filesystem]\n")); case HELP_BOOKMARK: return (gettext("\tbookmark " "\n")); case HELP_CHANNEL_PROGRAM: return (gettext("\tprogram [-jn] [-t ] " "[-m ]\n" "\t [lua args...]\n")); case HELP_LOAD_KEY: return (gettext("\tload-key [-rn] [-L ] " "<-a | filesystem|volume>\n")); case HELP_UNLOAD_KEY: return (gettext("\tunload-key [-r] " "<-a | filesystem|volume>\n")); case HELP_CHANGE_KEY: return (gettext("\tchange-key [-l] [-o keyformat=]\n" "\t [-o keylocation=] [-o pbkdf2iters=]\n" "\t \n" "\tchange-key -i [-l] \n")); case HELP_VERSION: return (gettext("\tversion\n")); case HELP_REDACT: return (gettext("\tredact " " ...\n")); case HELP_JAIL: return (gettext("\tjail \n")); case HELP_UNJAIL: return (gettext("\tunjail \n")); case HELP_WAIT: return (gettext("\twait [-t ] \n")); case HELP_ZONE: return (gettext("\tzone \n")); case HELP_UNZONE: return (gettext("\tunzone \n")); default: __builtin_unreachable(); } } void nomem(void) { (void) fprintf(stderr, gettext("internal error: out of memory\n")); exit(1); } /* * Utility function to guarantee malloc() success. */ void * safe_malloc(size_t size) { void *data; if ((data = calloc(1, size)) == NULL) nomem(); return (data); } static void * safe_realloc(void *data, size_t size) { void *newp; if ((newp = realloc(data, size)) == NULL) { free(data); nomem(); } return (newp); } static char * safe_strdup(const char *str) { char *dupstr = strdup(str); if (dupstr == NULL) nomem(); return (dupstr); } /* * Callback routine that will print out information for each of * the properties. */ static int usage_prop_cb(int prop, void *cb) { FILE *fp = cb; (void) fprintf(fp, "\t%-15s ", zfs_prop_to_name(prop)); if (zfs_prop_readonly(prop)) (void) fprintf(fp, " NO "); else (void) fprintf(fp, "YES "); if (zfs_prop_inheritable(prop)) (void) fprintf(fp, " YES "); else (void) fprintf(fp, " NO "); (void) fprintf(fp, "%s\n", zfs_prop_values(prop) ?: "-"); return (ZPROP_CONT); } /* * Display usage message. If we're inside a command, display only the usage for * that command. Otherwise, iterate over the entire command table and display * a complete usage message. */ static __attribute__((noreturn)) void usage(boolean_t requested) { int i; boolean_t show_properties = B_FALSE; FILE *fp = requested ? stdout : stderr; if (current_command == NULL) { (void) fprintf(fp, gettext("usage: zfs command args ...\n")); (void) fprintf(fp, gettext("where 'command' is one of the following:\n\n")); for (i = 0; i < NCOMMAND; i++) { if (command_table[i].name == NULL) (void) fprintf(fp, "\n"); else (void) fprintf(fp, "%s", get_usage(command_table[i].usage)); } (void) fprintf(fp, gettext("\nEach dataset is of the form: " "pool/[dataset/]*dataset[@name]\n")); } else { (void) fprintf(fp, gettext("usage:\n")); (void) fprintf(fp, "%s", get_usage(current_command->usage)); } if (current_command != NULL && (strcmp(current_command->name, "set") == 0 || strcmp(current_command->name, "get") == 0 || strcmp(current_command->name, "inherit") == 0 || strcmp(current_command->name, "list") == 0)) show_properties = B_TRUE; if (show_properties) { (void) fprintf(fp, gettext("\nThe following properties are supported:\n")); (void) fprintf(fp, "\n\t%-14s %s %s %s\n\n", "PROPERTY", "EDIT", "INHERIT", "VALUES"); /* Iterate over all properties */ (void) zprop_iter(usage_prop_cb, fp, B_FALSE, B_TRUE, ZFS_TYPE_DATASET); (void) fprintf(fp, "\t%-15s ", "userused@..."); (void) fprintf(fp, " NO NO \n"); (void) fprintf(fp, "\t%-15s ", "groupused@..."); (void) fprintf(fp, " NO NO \n"); (void) fprintf(fp, "\t%-15s ", "projectused@..."); (void) fprintf(fp, " NO NO \n"); (void) fprintf(fp, "\t%-15s ", "userobjused@..."); (void) fprintf(fp, " NO NO \n"); (void) fprintf(fp, "\t%-15s ", "groupobjused@..."); (void) fprintf(fp, " NO NO \n"); (void) fprintf(fp, "\t%-15s ", "projectobjused@..."); (void) fprintf(fp, " NO NO \n"); (void) fprintf(fp, "\t%-15s ", "userquota@..."); (void) fprintf(fp, "YES NO | none\n"); (void) fprintf(fp, "\t%-15s ", "groupquota@..."); (void) fprintf(fp, "YES NO | none\n"); (void) fprintf(fp, "\t%-15s ", "projectquota@..."); (void) fprintf(fp, "YES NO | none\n"); (void) fprintf(fp, "\t%-15s ", "userobjquota@..."); (void) fprintf(fp, "YES NO | none\n"); (void) fprintf(fp, "\t%-15s ", "groupobjquota@..."); (void) fprintf(fp, "YES NO | none\n"); (void) fprintf(fp, "\t%-15s ", "projectobjquota@..."); (void) fprintf(fp, "YES NO | none\n"); (void) fprintf(fp, "\t%-15s ", "written@"); (void) fprintf(fp, " NO NO \n"); (void) fprintf(fp, "\t%-15s ", "written#"); (void) fprintf(fp, " NO NO \n"); (void) fprintf(fp, gettext("\nSizes are specified in bytes " "with standard units such as K, M, G, etc.\n")); (void) fprintf(fp, "%s", gettext("\nUser-defined properties " "can be specified by using a name containing a colon " "(:).\n")); (void) fprintf(fp, gettext("\nThe {user|group|project}" "[obj]{used|quota}@ properties must be appended with\n" "a user|group|project specifier of one of these forms:\n" " POSIX name (eg: \"matt\")\n" " POSIX id (eg: \"126829\")\n" " SMB name@domain (eg: \"matt@sun\")\n" " SMB SID (eg: \"S-1-234-567-89\")\n")); } else { (void) fprintf(fp, gettext("\nFor the property list, run: %s\n"), "zfs set|get"); (void) fprintf(fp, gettext("\nFor the delegated permission list, run: %s\n"), "zfs allow|unallow"); } /* * See comments at end of main(). */ if (getenv("ZFS_ABORT") != NULL) { (void) printf("dumping core by request\n"); abort(); } exit(requested ? 0 : 2); } /* * Take a property=value argument string and add it to the given nvlist. * Modifies the argument inplace. */ static boolean_t parseprop(nvlist_t *props, char *propname) { char *propval; if ((propval = strchr(propname, '=')) == NULL) { (void) fprintf(stderr, gettext("missing " "'=' for property=value argument\n")); return (B_FALSE); } *propval = '\0'; propval++; if (nvlist_exists(props, propname)) { (void) fprintf(stderr, gettext("property '%s' " "specified multiple times\n"), propname); return (B_FALSE); } if (nvlist_add_string(props, propname, propval) != 0) nomem(); return (B_TRUE); } /* * Take a property name argument and add it to the given nvlist. * Modifies the argument inplace. */ static boolean_t parsepropname(nvlist_t *props, char *propname) { if (strchr(propname, '=') != NULL) { (void) fprintf(stderr, gettext("invalid character " "'=' in property argument\n")); return (B_FALSE); } if (nvlist_exists(props, propname)) { (void) fprintf(stderr, gettext("property '%s' " "specified multiple times\n"), propname); return (B_FALSE); } if (nvlist_add_boolean(props, propname) != 0) nomem(); return (B_TRUE); } static int parse_depth(char *opt, int *flags) { char *tmp; int depth; depth = (int)strtol(opt, &tmp, 0); if (*tmp) { (void) fprintf(stderr, gettext("%s is not an integer\n"), optarg); usage(B_FALSE); } if (depth < 0) { (void) fprintf(stderr, gettext("Depth can not be negative.\n")); usage(B_FALSE); } *flags |= (ZFS_ITER_DEPTH_LIMIT|ZFS_ITER_RECURSE); return (depth); } #define PROGRESS_DELAY 2 /* seconds */ static const char *pt_reverse = "\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b"; static time_t pt_begin; static char *pt_header = NULL; static boolean_t pt_shown; static void start_progress_timer(void) { pt_begin = time(NULL) + PROGRESS_DELAY; pt_shown = B_FALSE; } static void set_progress_header(const char *header) { assert(pt_header == NULL); pt_header = safe_strdup(header); if (pt_shown) { (void) printf("%s: ", header); (void) fflush(stdout); } } static void update_progress(const char *update) { if (!pt_shown && time(NULL) > pt_begin) { int len = strlen(update); (void) printf("%s: %s%*.*s", pt_header, update, len, len, pt_reverse); (void) fflush(stdout); pt_shown = B_TRUE; } else if (pt_shown) { int len = strlen(update); (void) printf("%s%*.*s", update, len, len, pt_reverse); (void) fflush(stdout); } } static void finish_progress(const char *done) { if (pt_shown) { (void) puts(done); (void) fflush(stdout); } free(pt_header); pt_header = NULL; } static int zfs_mount_and_share(libzfs_handle_t *hdl, const char *dataset, zfs_type_t type) { zfs_handle_t *zhp = NULL; int ret = 0; zhp = zfs_open(hdl, dataset, type); if (zhp == NULL) return (1); /* * Volumes may neither be mounted or shared. Potentially in the * future filesystems detected on these volumes could be mounted. */ if (zfs_get_type(zhp) == ZFS_TYPE_VOLUME) { zfs_close(zhp); return (0); } /* * Mount and/or share the new filesystem as appropriate. We provide a * verbose error message to let the user know that their filesystem was * in fact created, even if we failed to mount or share it. * * If the user doesn't want the dataset automatically mounted, then * skip the mount/share step */ if (zfs_prop_valid_for_type(ZFS_PROP_CANMOUNT, type, B_FALSE) && zfs_prop_get_int(zhp, ZFS_PROP_CANMOUNT) == ZFS_CANMOUNT_ON) { if (zfs_mount_delegation_check()) { (void) fprintf(stderr, gettext("filesystem " "successfully created, but it may only be " "mounted by root\n")); ret = 1; } else if (zfs_mount(zhp, NULL, 0) != 0) { (void) fprintf(stderr, gettext("filesystem " "successfully created, but not mounted\n")); ret = 1; } else if (zfs_share(zhp, NULL) != 0) { (void) fprintf(stderr, gettext("filesystem " "successfully created, but not shared\n")); ret = 1; } zfs_commit_shares(NULL); } zfs_close(zhp); return (ret); } /* * zfs clone [-p] [-o prop=value] ... * * Given an existing dataset, create a writable copy whose initial contents * are the same as the source. The newly created dataset maintains a * dependency on the original; the original cannot be destroyed so long as * the clone exists. * * The '-p' flag creates all the non-existing ancestors of the target first. */ static int zfs_do_clone(int argc, char **argv) { zfs_handle_t *zhp = NULL; boolean_t parents = B_FALSE; nvlist_t *props; int ret = 0; int c; if (nvlist_alloc(&props, NV_UNIQUE_NAME, 0) != 0) nomem(); /* check options */ while ((c = getopt(argc, argv, "o:p")) != -1) { switch (c) { case 'o': if (!parseprop(props, optarg)) { nvlist_free(props); return (1); } break; case 'p': parents = B_TRUE; break; case '?': (void) fprintf(stderr, gettext("invalid option '%c'\n"), optopt); goto usage; } } argc -= optind; argv += optind; /* check number of arguments */ if (argc < 1) { (void) fprintf(stderr, gettext("missing source dataset " "argument\n")); goto usage; } if (argc < 2) { (void) fprintf(stderr, gettext("missing target dataset " "argument\n")); goto usage; } if (argc > 2) { (void) fprintf(stderr, gettext("too many arguments\n")); goto usage; } /* open the source dataset */ if ((zhp = zfs_open(g_zfs, argv[0], ZFS_TYPE_SNAPSHOT)) == NULL) { nvlist_free(props); return (1); } if (parents && zfs_name_valid(argv[1], ZFS_TYPE_FILESYSTEM | ZFS_TYPE_VOLUME)) { /* * Now create the ancestors of the target dataset. If the * target already exists and '-p' option was used we should not * complain. */ if (zfs_dataset_exists(g_zfs, argv[1], ZFS_TYPE_FILESYSTEM | ZFS_TYPE_VOLUME)) { zfs_close(zhp); nvlist_free(props); return (0); } if (zfs_create_ancestors(g_zfs, argv[1]) != 0) { zfs_close(zhp); nvlist_free(props); return (1); } } /* pass to libzfs */ ret = zfs_clone(zhp, argv[1], props); /* create the mountpoint if necessary */ if (ret == 0) { if (log_history) { (void) zpool_log_history(g_zfs, history_str); log_history = B_FALSE; } ret = zfs_mount_and_share(g_zfs, argv[1], ZFS_TYPE_DATASET); } zfs_close(zhp); nvlist_free(props); return (!!ret); usage: ASSERT3P(zhp, ==, NULL); nvlist_free(props); usage(B_FALSE); return (-1); } /* * Return a default volblocksize for the pool which always uses more than * half of the data sectors. This primarily applies to dRAID which always * writes full stripe widths. */ static uint64_t default_volblocksize(zpool_handle_t *zhp, nvlist_t *props) { uint64_t volblocksize, asize = SPA_MINBLOCKSIZE; nvlist_t *tree, **vdevs; uint_t nvdevs; nvlist_t *config = zpool_get_config(zhp, NULL); if (nvlist_lookup_nvlist(config, ZPOOL_CONFIG_VDEV_TREE, &tree) != 0 || nvlist_lookup_nvlist_array(tree, ZPOOL_CONFIG_CHILDREN, &vdevs, &nvdevs) != 0) { return (ZVOL_DEFAULT_BLOCKSIZE); } for (int i = 0; i < nvdevs; i++) { nvlist_t *nv = vdevs[i]; uint64_t ashift, ndata, nparity; if (nvlist_lookup_uint64(nv, ZPOOL_CONFIG_ASHIFT, &ashift) != 0) continue; if (nvlist_lookup_uint64(nv, ZPOOL_CONFIG_DRAID_NDATA, &ndata) == 0) { /* dRAID minimum allocation width */ asize = MAX(asize, ndata * (1ULL << ashift)); } else if (nvlist_lookup_uint64(nv, ZPOOL_CONFIG_NPARITY, &nparity) == 0) { /* raidz minimum allocation width */ if (nparity == 1) asize = MAX(asize, 2 * (1ULL << ashift)); else asize = MAX(asize, 4 * (1ULL << ashift)); } else { /* mirror or (non-redundant) leaf vdev */ asize = MAX(asize, 1ULL << ashift); } } /* * Calculate the target volblocksize such that more than half * of the asize is used. The following table is for 4k sectors. * * n asize blksz used | n asize blksz used * -------------------------+--------------------------------- * 1 4,096 8,192 100% | 9 36,864 32,768 88% * 2 8,192 8,192 100% | 10 40,960 32,768 80% * 3 12,288 8,192 66% | 11 45,056 32,768 72% * 4 16,384 16,384 100% | 12 49,152 32,768 66% * 5 20,480 16,384 80% | 13 53,248 32,768 61% * 6 24,576 16,384 66% | 14 57,344 32,768 57% * 7 28,672 16,384 57% | 15 61,440 32,768 53% * 8 32,768 32,768 100% | 16 65,536 65,636 100% * * This is primarily a concern for dRAID which always allocates * a full stripe width. For dRAID the default stripe width is * n=8 in which case the volblocksize is set to 32k. Ignoring * compression there are no unused sectors. This same reasoning * applies to raidz[2,3] so target 4 sectors to minimize waste. */ uint64_t tgt_volblocksize = ZVOL_DEFAULT_BLOCKSIZE; while (tgt_volblocksize * 2 <= asize) tgt_volblocksize *= 2; const char *prop = zfs_prop_to_name(ZFS_PROP_VOLBLOCKSIZE); if (nvlist_lookup_uint64(props, prop, &volblocksize) == 0) { /* Issue a warning when a non-optimal size is requested. */ if (volblocksize < ZVOL_DEFAULT_BLOCKSIZE) { (void) fprintf(stderr, gettext("Warning: " "volblocksize (%llu) is less than the default " "minimum block size (%llu).\nTo reduce wasted " "space a volblocksize of %llu is recommended.\n"), (u_longlong_t)volblocksize, (u_longlong_t)ZVOL_DEFAULT_BLOCKSIZE, (u_longlong_t)tgt_volblocksize); } else if (volblocksize < tgt_volblocksize) { (void) fprintf(stderr, gettext("Warning: " "volblocksize (%llu) is much less than the " "minimum allocation\nunit (%llu), which wastes " "at least %llu%% of space. To reduce wasted " "space,\nuse a larger volblocksize (%llu is " "recommended), fewer dRAID data disks\n" "per group, or smaller sector size (ashift).\n"), (u_longlong_t)volblocksize, (u_longlong_t)asize, (u_longlong_t)((100 * (asize - volblocksize)) / asize), (u_longlong_t)tgt_volblocksize); } } else { volblocksize = tgt_volblocksize; fnvlist_add_uint64(props, prop, volblocksize); } return (volblocksize); } /* * zfs create [-Pnpv] [-o prop=value] ... fs * zfs create [-Pnpsv] [-b blocksize] [-o prop=value] ... -V vol size * * Create a new dataset. This command can be used to create filesystems * and volumes. Snapshot creation is handled by 'zfs snapshot'. * For volumes, the user must specify a size to be used. * * The '-s' flag applies only to volumes, and indicates that we should not try * to set the reservation for this volume. By default we set a reservation * equal to the size for any volume. For pools with SPA_VERSION >= * SPA_VERSION_REFRESERVATION, we set a refreservation instead. * * The '-p' flag creates all the non-existing ancestors of the target first. * * The '-n' flag is no-op (dry run) mode. This will perform a user-space sanity * check of arguments and properties, but does not check for permissions, * available space, etc. * * The '-u' flag prevents the newly created file system from being mounted. * * The '-v' flag is for verbose output. * * The '-P' flag is used for parseable output. It implies '-v'. */ static int zfs_do_create(int argc, char **argv) { zfs_type_t type = ZFS_TYPE_FILESYSTEM; zpool_handle_t *zpool_handle = NULL; nvlist_t *real_props = NULL; uint64_t volsize = 0; int c; boolean_t noreserve = B_FALSE; boolean_t bflag = B_FALSE; boolean_t parents = B_FALSE; boolean_t dryrun = B_FALSE; boolean_t nomount = B_FALSE; boolean_t verbose = B_FALSE; boolean_t parseable = B_FALSE; int ret = 1; nvlist_t *props; uint64_t intval; char *strval; if (nvlist_alloc(&props, NV_UNIQUE_NAME, 0) != 0) nomem(); /* check options */ while ((c = getopt(argc, argv, ":PV:b:nso:puv")) != -1) { switch (c) { case 'V': type = ZFS_TYPE_VOLUME; if (zfs_nicestrtonum(g_zfs, optarg, &intval) != 0) { (void) fprintf(stderr, gettext("bad volume " "size '%s': %s\n"), optarg, libzfs_error_description(g_zfs)); goto error; } if (nvlist_add_uint64(props, zfs_prop_to_name(ZFS_PROP_VOLSIZE), intval) != 0) nomem(); volsize = intval; break; case 'P': verbose = B_TRUE; parseable = B_TRUE; break; case 'p': parents = B_TRUE; break; case 'b': bflag = B_TRUE; if (zfs_nicestrtonum(g_zfs, optarg, &intval) != 0) { (void) fprintf(stderr, gettext("bad volume " "block size '%s': %s\n"), optarg, libzfs_error_description(g_zfs)); goto error; } if (nvlist_add_uint64(props, zfs_prop_to_name(ZFS_PROP_VOLBLOCKSIZE), intval) != 0) nomem(); break; case 'n': dryrun = B_TRUE; break; case 'o': if (!parseprop(props, optarg)) goto error; break; case 's': noreserve = B_TRUE; break; case 'u': nomount = B_TRUE; break; case 'v': verbose = B_TRUE; break; case ':': (void) fprintf(stderr, gettext("missing size " "argument\n")); goto badusage; case '?': (void) fprintf(stderr, gettext("invalid option '%c'\n"), optopt); goto badusage; } } if ((bflag || noreserve) && type != ZFS_TYPE_VOLUME) { (void) fprintf(stderr, gettext("'-s' and '-b' can only be " "used when creating a volume\n")); goto badusage; } if (nomount && type != ZFS_TYPE_FILESYSTEM) { (void) fprintf(stderr, gettext("'-u' can only be " "used when creating a filesystem\n")); goto badusage; } argc -= optind; argv += optind; /* check number of arguments */ if (argc == 0) { (void) fprintf(stderr, gettext("missing %s argument\n"), zfs_type_to_name(type)); goto badusage; } if (argc > 1) { (void) fprintf(stderr, gettext("too many arguments\n")); goto badusage; } if (dryrun || type == ZFS_TYPE_VOLUME) { char msg[ZFS_MAX_DATASET_NAME_LEN * 2]; char *p; if ((p = strchr(argv[0], '/')) != NULL) *p = '\0'; zpool_handle = zpool_open(g_zfs, argv[0]); if (p != NULL) *p = '/'; if (zpool_handle == NULL) goto error; (void) snprintf(msg, sizeof (msg), dryrun ? gettext("cannot verify '%s'") : gettext("cannot create '%s'"), argv[0]); if (props && (real_props = zfs_valid_proplist(g_zfs, type, props, 0, NULL, zpool_handle, B_TRUE, msg)) == NULL) { zpool_close(zpool_handle); goto error; } } if (type == ZFS_TYPE_VOLUME) { const char *prop = zfs_prop_to_name(ZFS_PROP_VOLBLOCKSIZE); uint64_t volblocksize = default_volblocksize(zpool_handle, real_props); if (volblocksize != ZVOL_DEFAULT_BLOCKSIZE && nvlist_lookup_string(props, prop, &strval) != 0) { if (asprintf(&strval, "%llu", (u_longlong_t)volblocksize) == -1) nomem(); nvlist_add_string(props, prop, strval); free(strval); } /* * If volsize is not a multiple of volblocksize, round it * up to the nearest multiple of the volblocksize. */ if (volsize % volblocksize) { volsize = P2ROUNDUP_TYPED(volsize, volblocksize, uint64_t); if (nvlist_add_uint64(props, zfs_prop_to_name(ZFS_PROP_VOLSIZE), volsize) != 0) { nvlist_free(props); nomem(); } } } if (type == ZFS_TYPE_VOLUME && !noreserve) { uint64_t spa_version; zfs_prop_t resv_prop; spa_version = zpool_get_prop_int(zpool_handle, ZPOOL_PROP_VERSION, NULL); if (spa_version >= SPA_VERSION_REFRESERVATION) resv_prop = ZFS_PROP_REFRESERVATION; else resv_prop = ZFS_PROP_RESERVATION; volsize = zvol_volsize_to_reservation(zpool_handle, volsize, real_props); if (nvlist_lookup_string(props, zfs_prop_to_name(resv_prop), &strval) != 0) { if (nvlist_add_uint64(props, zfs_prop_to_name(resv_prop), volsize) != 0) { nvlist_free(props); nomem(); } } } if (zpool_handle != NULL) { zpool_close(zpool_handle); nvlist_free(real_props); } if (parents && zfs_name_valid(argv[0], type)) { /* * Now create the ancestors of target dataset. If the target * already exists and '-p' option was used we should not * complain. */ if (zfs_dataset_exists(g_zfs, argv[0], type)) { ret = 0; goto error; } if (verbose) { (void) printf(parseable ? "create_ancestors\t%s\n" : dryrun ? "would create ancestors of %s\n" : "create ancestors of %s\n", argv[0]); } if (!dryrun) { if (zfs_create_ancestors(g_zfs, argv[0]) != 0) { goto error; } } } if (verbose) { nvpair_t *nvp = NULL; (void) printf(parseable ? "create\t%s\n" : dryrun ? "would create %s\n" : "create %s\n", argv[0]); while ((nvp = nvlist_next_nvpair(props, nvp)) != NULL) { uint64_t uval; char *sval; switch (nvpair_type(nvp)) { case DATA_TYPE_UINT64: VERIFY0(nvpair_value_uint64(nvp, &uval)); (void) printf(parseable ? "property\t%s\t%llu\n" : "\t%s=%llu\n", nvpair_name(nvp), (u_longlong_t)uval); break; case DATA_TYPE_STRING: VERIFY0(nvpair_value_string(nvp, &sval)); (void) printf(parseable ? "property\t%s\t%s\n" : "\t%s=%s\n", nvpair_name(nvp), sval); break; default: (void) fprintf(stderr, "property '%s' " "has illegal type %d\n", nvpair_name(nvp), nvpair_type(nvp)); abort(); } } } if (dryrun) { ret = 0; goto error; } /* pass to libzfs */ if (zfs_create(g_zfs, argv[0], type, props) != 0) goto error; if (log_history) { (void) zpool_log_history(g_zfs, history_str); log_history = B_FALSE; } if (nomount) { ret = 0; goto error; } ret = zfs_mount_and_share(g_zfs, argv[0], ZFS_TYPE_DATASET); error: nvlist_free(props); return (ret); badusage: nvlist_free(props); usage(B_FALSE); return (2); } /* * zfs destroy [-rRf] * zfs destroy [-rRd] * * -r Recursively destroy all children * -R Recursively destroy all dependents, including clones * -f Force unmounting of any dependents * -d If we can't destroy now, mark for deferred destruction * * Destroys the given dataset. By default, it will unmount any filesystems, * and refuse to destroy a dataset that has any dependents. A dependent can * either be a child, or a clone of a child. */ typedef struct destroy_cbdata { boolean_t cb_first; boolean_t cb_force; boolean_t cb_recurse; boolean_t cb_error; boolean_t cb_doclones; zfs_handle_t *cb_target; boolean_t cb_defer_destroy; boolean_t cb_verbose; boolean_t cb_parsable; boolean_t cb_dryrun; nvlist_t *cb_nvl; nvlist_t *cb_batchedsnaps; /* first snap in contiguous run */ char *cb_firstsnap; /* previous snap in contiguous run */ char *cb_prevsnap; int64_t cb_snapused; char *cb_snapspec; char *cb_bookmark; uint64_t cb_snap_count; } destroy_cbdata_t; /* * Check for any dependents based on the '-r' or '-R' flags. */ static int destroy_check_dependent(zfs_handle_t *zhp, void *data) { destroy_cbdata_t *cbp = data; const char *tname = zfs_get_name(cbp->cb_target); const char *name = zfs_get_name(zhp); if (strncmp(tname, name, strlen(tname)) == 0 && (name[strlen(tname)] == '/' || name[strlen(tname)] == '@')) { /* * This is a direct descendant, not a clone somewhere else in * the hierarchy. */ if (cbp->cb_recurse) goto out; if (cbp->cb_first) { (void) fprintf(stderr, gettext("cannot destroy '%s': " "%s has children\n"), zfs_get_name(cbp->cb_target), zfs_type_to_name(zfs_get_type(cbp->cb_target))); (void) fprintf(stderr, gettext("use '-r' to destroy " "the following datasets:\n")); cbp->cb_first = B_FALSE; cbp->cb_error = B_TRUE; } (void) fprintf(stderr, "%s\n", zfs_get_name(zhp)); } else { /* * This is a clone. We only want to report this if the '-r' * wasn't specified, or the target is a snapshot. */ if (!cbp->cb_recurse && zfs_get_type(cbp->cb_target) != ZFS_TYPE_SNAPSHOT) goto out; if (cbp->cb_first) { (void) fprintf(stderr, gettext("cannot destroy '%s': " "%s has dependent clones\n"), zfs_get_name(cbp->cb_target), zfs_type_to_name(zfs_get_type(cbp->cb_target))); (void) fprintf(stderr, gettext("use '-R' to destroy " "the following datasets:\n")); cbp->cb_first = B_FALSE; cbp->cb_error = B_TRUE; cbp->cb_dryrun = B_TRUE; } (void) fprintf(stderr, "%s\n", zfs_get_name(zhp)); } out: zfs_close(zhp); return (0); } static int destroy_batched(destroy_cbdata_t *cb) { int error = zfs_destroy_snaps_nvl(g_zfs, cb->cb_batchedsnaps, B_FALSE); fnvlist_free(cb->cb_batchedsnaps); cb->cb_batchedsnaps = fnvlist_alloc(); return (error); } static int destroy_callback(zfs_handle_t *zhp, void *data) { destroy_cbdata_t *cb = data; const char *name = zfs_get_name(zhp); int error; if (cb->cb_verbose) { if (cb->cb_parsable) { (void) printf("destroy\t%s\n", name); } else if (cb->cb_dryrun) { (void) printf(gettext("would destroy %s\n"), name); } else { (void) printf(gettext("will destroy %s\n"), name); } } /* * Ignore pools (which we've already flagged as an error before getting * here). */ if (strchr(zfs_get_name(zhp), '/') == NULL && zfs_get_type(zhp) == ZFS_TYPE_FILESYSTEM) { zfs_close(zhp); return (0); } if (cb->cb_dryrun) { zfs_close(zhp); return (0); } /* * We batch up all contiguous snapshots (even of different * filesystems) and destroy them with one ioctl. We can't * simply do all snap deletions and then all fs deletions, * because we must delete a clone before its origin. */ if (zfs_get_type(zhp) == ZFS_TYPE_SNAPSHOT) { cb->cb_snap_count++; fnvlist_add_boolean(cb->cb_batchedsnaps, name); - if (cb->cb_snap_count % 10 == 0 && cb->cb_defer_destroy) + if (cb->cb_snap_count % 10 == 0 && cb->cb_defer_destroy) { error = destroy_batched(cb); + if (error != 0) { + zfs_close(zhp); + return (-1); + } + } } else { error = destroy_batched(cb); if (error != 0 || zfs_unmount(zhp, NULL, cb->cb_force ? MS_FORCE : 0) != 0 || zfs_destroy(zhp, cb->cb_defer_destroy) != 0) { zfs_close(zhp); /* * When performing a recursive destroy we ignore errors * so that the recursive destroy could continue * destroying past problem datasets */ if (cb->cb_recurse) { cb->cb_error = B_TRUE; return (0); } return (-1); } } zfs_close(zhp); return (0); } static int destroy_print_cb(zfs_handle_t *zhp, void *arg) { destroy_cbdata_t *cb = arg; const char *name = zfs_get_name(zhp); int err = 0; if (nvlist_exists(cb->cb_nvl, name)) { if (cb->cb_firstsnap == NULL) cb->cb_firstsnap = strdup(name); if (cb->cb_prevsnap != NULL) free(cb->cb_prevsnap); /* this snap continues the current range */ cb->cb_prevsnap = strdup(name); if (cb->cb_firstsnap == NULL || cb->cb_prevsnap == NULL) nomem(); if (cb->cb_verbose) { if (cb->cb_parsable) { (void) printf("destroy\t%s\n", name); } else if (cb->cb_dryrun) { (void) printf(gettext("would destroy %s\n"), name); } else { (void) printf(gettext("will destroy %s\n"), name); } } } else if (cb->cb_firstsnap != NULL) { /* end of this range */ uint64_t used = 0; err = lzc_snaprange_space(cb->cb_firstsnap, cb->cb_prevsnap, &used); cb->cb_snapused += used; free(cb->cb_firstsnap); cb->cb_firstsnap = NULL; free(cb->cb_prevsnap); cb->cb_prevsnap = NULL; } zfs_close(zhp); return (err); } static int destroy_print_snapshots(zfs_handle_t *fs_zhp, destroy_cbdata_t *cb) { int err; assert(cb->cb_firstsnap == NULL); assert(cb->cb_prevsnap == NULL); err = zfs_iter_snapshots_sorted(fs_zhp, destroy_print_cb, cb, 0, 0); if (cb->cb_firstsnap != NULL) { uint64_t used = 0; if (err == 0) { err = lzc_snaprange_space(cb->cb_firstsnap, cb->cb_prevsnap, &used); } cb->cb_snapused += used; free(cb->cb_firstsnap); cb->cb_firstsnap = NULL; free(cb->cb_prevsnap); cb->cb_prevsnap = NULL; } return (err); } static int snapshot_to_nvl_cb(zfs_handle_t *zhp, void *arg) { destroy_cbdata_t *cb = arg; int err = 0; /* Check for clones. */ if (!cb->cb_doclones && !cb->cb_defer_destroy) { cb->cb_target = zhp; cb->cb_first = B_TRUE; err = zfs_iter_dependents(zhp, B_TRUE, destroy_check_dependent, cb); } if (err == 0) { if (nvlist_add_boolean(cb->cb_nvl, zfs_get_name(zhp))) nomem(); } zfs_close(zhp); return (err); } static int gather_snapshots(zfs_handle_t *zhp, void *arg) { destroy_cbdata_t *cb = arg; int err = 0; err = zfs_iter_snapspec(zhp, cb->cb_snapspec, snapshot_to_nvl_cb, cb); if (err == ENOENT) err = 0; if (err != 0) goto out; if (cb->cb_verbose) { err = destroy_print_snapshots(zhp, cb); if (err != 0) goto out; } if (cb->cb_recurse) err = zfs_iter_filesystems(zhp, gather_snapshots, cb); out: zfs_close(zhp); return (err); } static int destroy_clones(destroy_cbdata_t *cb) { nvpair_t *pair; for (pair = nvlist_next_nvpair(cb->cb_nvl, NULL); pair != NULL; pair = nvlist_next_nvpair(cb->cb_nvl, pair)) { zfs_handle_t *zhp = zfs_open(g_zfs, nvpair_name(pair), ZFS_TYPE_SNAPSHOT); if (zhp != NULL) { boolean_t defer = cb->cb_defer_destroy; int err; /* * We can't defer destroy non-snapshots, so set it to * false while destroying the clones. */ cb->cb_defer_destroy = B_FALSE; err = zfs_iter_dependents(zhp, B_FALSE, destroy_callback, cb); cb->cb_defer_destroy = defer; zfs_close(zhp); if (err != 0) return (err); } } return (0); } static int zfs_do_destroy(int argc, char **argv) { destroy_cbdata_t cb = { 0 }; int rv = 0; int err = 0; int c; zfs_handle_t *zhp = NULL; char *at, *pound; zfs_type_t type = ZFS_TYPE_DATASET; /* check options */ while ((c = getopt(argc, argv, "vpndfrR")) != -1) { switch (c) { case 'v': cb.cb_verbose = B_TRUE; break; case 'p': cb.cb_verbose = B_TRUE; cb.cb_parsable = B_TRUE; break; case 'n': cb.cb_dryrun = B_TRUE; break; case 'd': cb.cb_defer_destroy = B_TRUE; type = ZFS_TYPE_SNAPSHOT; break; case 'f': cb.cb_force = B_TRUE; break; case 'r': cb.cb_recurse = B_TRUE; break; case 'R': cb.cb_recurse = B_TRUE; cb.cb_doclones = B_TRUE; break; case '?': default: (void) fprintf(stderr, gettext("invalid option '%c'\n"), optopt); usage(B_FALSE); } } argc -= optind; argv += optind; /* check number of arguments */ if (argc == 0) { (void) fprintf(stderr, gettext("missing dataset argument\n")); usage(B_FALSE); } if (argc > 1) { (void) fprintf(stderr, gettext("too many arguments\n")); usage(B_FALSE); } at = strchr(argv[0], '@'); pound = strchr(argv[0], '#'); if (at != NULL) { /* Build the list of snaps to destroy in cb_nvl. */ cb.cb_nvl = fnvlist_alloc(); *at = '\0'; zhp = zfs_open(g_zfs, argv[0], ZFS_TYPE_FILESYSTEM | ZFS_TYPE_VOLUME); if (zhp == NULL) { nvlist_free(cb.cb_nvl); return (1); } cb.cb_snapspec = at + 1; if (gather_snapshots(zfs_handle_dup(zhp), &cb) != 0 || cb.cb_error) { rv = 1; goto out; } if (nvlist_empty(cb.cb_nvl)) { (void) fprintf(stderr, gettext("could not find any " "snapshots to destroy; check snapshot names.\n")); rv = 1; goto out; } if (cb.cb_verbose) { char buf[16]; zfs_nicebytes(cb.cb_snapused, buf, sizeof (buf)); if (cb.cb_parsable) { (void) printf("reclaim\t%llu\n", (u_longlong_t)cb.cb_snapused); } else if (cb.cb_dryrun) { (void) printf(gettext("would reclaim %s\n"), buf); } else { (void) printf(gettext("will reclaim %s\n"), buf); } } if (!cb.cb_dryrun) { if (cb.cb_doclones) { cb.cb_batchedsnaps = fnvlist_alloc(); err = destroy_clones(&cb); if (err == 0) { err = zfs_destroy_snaps_nvl(g_zfs, cb.cb_batchedsnaps, B_FALSE); } if (err != 0) { rv = 1; goto out; } } if (err == 0) { err = zfs_destroy_snaps_nvl(g_zfs, cb.cb_nvl, cb.cb_defer_destroy); } } if (err != 0) rv = 1; } else if (pound != NULL) { int err; nvlist_t *nvl; if (cb.cb_dryrun) { (void) fprintf(stderr, "dryrun is not supported with bookmark\n"); return (-1); } if (cb.cb_defer_destroy) { (void) fprintf(stderr, "defer destroy is not supported with bookmark\n"); return (-1); } if (cb.cb_recurse) { (void) fprintf(stderr, "recursive is not supported with bookmark\n"); return (-1); } /* * Unfortunately, zfs_bookmark() doesn't honor the * casesensitivity setting. However, we can't simply * remove this check, because lzc_destroy_bookmarks() * ignores non-existent bookmarks, so this is necessary * to get a proper error message. */ if (!zfs_bookmark_exists(argv[0])) { (void) fprintf(stderr, gettext("bookmark '%s' " "does not exist.\n"), argv[0]); return (1); } nvl = fnvlist_alloc(); fnvlist_add_boolean(nvl, argv[0]); err = lzc_destroy_bookmarks(nvl, NULL); if (err != 0) { (void) zfs_standard_error(g_zfs, err, "cannot destroy bookmark"); } nvlist_free(nvl); return (err); } else { /* Open the given dataset */ if ((zhp = zfs_open(g_zfs, argv[0], type)) == NULL) return (1); cb.cb_target = zhp; /* * Perform an explicit check for pools before going any further. */ if (!cb.cb_recurse && strchr(zfs_get_name(zhp), '/') == NULL && zfs_get_type(zhp) == ZFS_TYPE_FILESYSTEM) { (void) fprintf(stderr, gettext("cannot destroy '%s': " "operation does not apply to pools\n"), zfs_get_name(zhp)); (void) fprintf(stderr, gettext("use 'zfs destroy -r " "%s' to destroy all datasets in the pool\n"), zfs_get_name(zhp)); (void) fprintf(stderr, gettext("use 'zpool destroy %s' " "to destroy the pool itself\n"), zfs_get_name(zhp)); rv = 1; goto out; } /* * Check for any dependents and/or clones. */ cb.cb_first = B_TRUE; if (!cb.cb_doclones && zfs_iter_dependents(zhp, B_TRUE, destroy_check_dependent, &cb) != 0) { rv = 1; goto out; } if (cb.cb_error) { rv = 1; goto out; } cb.cb_batchedsnaps = fnvlist_alloc(); if (zfs_iter_dependents(zhp, B_FALSE, destroy_callback, &cb) != 0) { rv = 1; goto out; } /* * Do the real thing. The callback will close the * handle regardless of whether it succeeds or not. */ err = destroy_callback(zhp, &cb); zhp = NULL; if (err == 0) { err = zfs_destroy_snaps_nvl(g_zfs, cb.cb_batchedsnaps, cb.cb_defer_destroy); } if (err != 0 || cb.cb_error == B_TRUE) rv = 1; } out: fnvlist_free(cb.cb_batchedsnaps); fnvlist_free(cb.cb_nvl); if (zhp != NULL) zfs_close(zhp); return (rv); } static boolean_t is_recvd_column(zprop_get_cbdata_t *cbp) { int i; zfs_get_column_t col; for (i = 0; i < ZFS_GET_NCOLS && (col = cbp->cb_columns[i]) != GET_COL_NONE; i++) if (col == GET_COL_RECVD) return (B_TRUE); return (B_FALSE); } /* * zfs get [-rHp] [-o all | field[,field]...] [-s source[,source]...] * < all | property[,property]... > < fs | snap | vol > ... * * -r recurse over any child datasets * -H scripted mode. Headers are stripped, and fields are separated * by tabs instead of spaces. * -o Set of fields to display. One of "name,property,value, * received,source". Default is "name,property,value,source". * "all" is an alias for all five. * -s Set of sources to allow. One of * "local,default,inherited,received,temporary,none". Default is * all six. * -p Display values in parsable (literal) format. * * Prints properties for the given datasets. The user can control which * columns to display as well as which property types to allow. */ /* * Invoked to display the properties for a single dataset. */ static int get_callback(zfs_handle_t *zhp, void *data) { char buf[ZFS_MAXPROPLEN]; char rbuf[ZFS_MAXPROPLEN]; zprop_source_t sourcetype; char source[ZFS_MAX_DATASET_NAME_LEN]; zprop_get_cbdata_t *cbp = data; nvlist_t *user_props = zfs_get_user_props(zhp); zprop_list_t *pl = cbp->cb_proplist; nvlist_t *propval; const char *strval; const char *sourceval; boolean_t received = is_recvd_column(cbp); for (; pl != NULL; pl = pl->pl_next) { char *recvdval = NULL; /* * Skip the special fake placeholder. This will also skip over * the name property when 'all' is specified. */ if (pl->pl_prop == ZFS_PROP_NAME && pl == cbp->cb_proplist) continue; if (pl->pl_prop != ZPROP_USERPROP) { if (zfs_prop_get(zhp, pl->pl_prop, buf, sizeof (buf), &sourcetype, source, sizeof (source), cbp->cb_literal) != 0) { if (pl->pl_all) continue; if (!zfs_prop_valid_for_type(pl->pl_prop, ZFS_TYPE_DATASET, B_FALSE)) { (void) fprintf(stderr, gettext("No such property '%s'\n"), zfs_prop_to_name(pl->pl_prop)); continue; } sourcetype = ZPROP_SRC_NONE; (void) strlcpy(buf, "-", sizeof (buf)); } if (received && (zfs_prop_get_recvd(zhp, zfs_prop_to_name(pl->pl_prop), rbuf, sizeof (rbuf), cbp->cb_literal) == 0)) recvdval = rbuf; zprop_print_one_property(zfs_get_name(zhp), cbp, zfs_prop_to_name(pl->pl_prop), buf, sourcetype, source, recvdval); } else if (zfs_prop_userquota(pl->pl_user_prop)) { sourcetype = ZPROP_SRC_LOCAL; if (zfs_prop_get_userquota(zhp, pl->pl_user_prop, buf, sizeof (buf), cbp->cb_literal) != 0) { sourcetype = ZPROP_SRC_NONE; (void) strlcpy(buf, "-", sizeof (buf)); } zprop_print_one_property(zfs_get_name(zhp), cbp, pl->pl_user_prop, buf, sourcetype, source, NULL); } else if (zfs_prop_written(pl->pl_user_prop)) { sourcetype = ZPROP_SRC_LOCAL; if (zfs_prop_get_written(zhp, pl->pl_user_prop, buf, sizeof (buf), cbp->cb_literal) != 0) { sourcetype = ZPROP_SRC_NONE; (void) strlcpy(buf, "-", sizeof (buf)); } zprop_print_one_property(zfs_get_name(zhp), cbp, pl->pl_user_prop, buf, sourcetype, source, NULL); } else { if (nvlist_lookup_nvlist(user_props, pl->pl_user_prop, &propval) != 0) { if (pl->pl_all) continue; sourcetype = ZPROP_SRC_NONE; strval = "-"; } else { strval = fnvlist_lookup_string(propval, ZPROP_VALUE); sourceval = fnvlist_lookup_string(propval, ZPROP_SOURCE); if (strcmp(sourceval, zfs_get_name(zhp)) == 0) { sourcetype = ZPROP_SRC_LOCAL; } else if (strcmp(sourceval, ZPROP_SOURCE_VAL_RECVD) == 0) { sourcetype = ZPROP_SRC_RECEIVED; } else { sourcetype = ZPROP_SRC_INHERITED; (void) strlcpy(source, sourceval, sizeof (source)); } } if (received && (zfs_prop_get_recvd(zhp, pl->pl_user_prop, rbuf, sizeof (rbuf), cbp->cb_literal) == 0)) recvdval = rbuf; zprop_print_one_property(zfs_get_name(zhp), cbp, pl->pl_user_prop, strval, sourcetype, source, recvdval); } } return (0); } static int zfs_do_get(int argc, char **argv) { zprop_get_cbdata_t cb = { 0 }; int i, c, flags = ZFS_ITER_ARGS_CAN_BE_PATHS; int types = ZFS_TYPE_DATASET | ZFS_TYPE_BOOKMARK; char *fields; int ret = 0; int limit = 0; zprop_list_t fake_name = { 0 }; /* * Set up default columns and sources. */ cb.cb_sources = ZPROP_SRC_ALL; cb.cb_columns[0] = GET_COL_NAME; cb.cb_columns[1] = GET_COL_PROPERTY; cb.cb_columns[2] = GET_COL_VALUE; cb.cb_columns[3] = GET_COL_SOURCE; cb.cb_type = ZFS_TYPE_DATASET; /* check options */ while ((c = getopt(argc, argv, ":d:o:s:rt:Hp")) != -1) { switch (c) { case 'p': cb.cb_literal = B_TRUE; break; case 'd': limit = parse_depth(optarg, &flags); break; case 'r': flags |= ZFS_ITER_RECURSE; break; case 'H': cb.cb_scripted = B_TRUE; break; case ':': (void) fprintf(stderr, gettext("missing argument for " "'%c' option\n"), optopt); usage(B_FALSE); break; case 'o': /* * Process the set of columns to display. We zero out * the structure to give us a blank slate. */ memset(&cb.cb_columns, 0, sizeof (cb.cb_columns)); i = 0; for (char *tok; (tok = strsep(&optarg, ",")); ) { static const char *const col_subopts[] = { "name", "property", "value", "received", "source", "all" }; static const zfs_get_column_t col_subopt_col[] = { GET_COL_NAME, GET_COL_PROPERTY, GET_COL_VALUE, GET_COL_RECVD, GET_COL_SOURCE }; static const int col_subopt_flags[] = { 0, 0, 0, ZFS_ITER_RECVD_PROPS, 0 }; if (i == ZFS_GET_NCOLS) { (void) fprintf(stderr, gettext("too " "many fields given to -o " "option\n")); usage(B_FALSE); } for (c = 0; c < ARRAY_SIZE(col_subopts); ++c) if (strcmp(tok, col_subopts[c]) == 0) goto found; (void) fprintf(stderr, gettext("invalid column name '%s'\n"), tok); usage(B_FALSE); found: if (c >= 5) { if (i > 0) { (void) fprintf(stderr, gettext("\"all\" conflicts " "with specific fields " "given to -o option\n")); usage(B_FALSE); } memcpy(cb.cb_columns, col_subopt_col, sizeof (col_subopt_col)); flags |= ZFS_ITER_RECVD_PROPS; i = ZFS_GET_NCOLS; } else { cb.cb_columns[i++] = col_subopt_col[c]; flags |= col_subopt_flags[c]; } } break; case 's': cb.cb_sources = 0; for (char *tok; (tok = strsep(&optarg, ",")); ) { static const char *const source_opt[] = { "local", "default", "inherited", "received", "temporary", "none" }; static const int source_flg[] = { ZPROP_SRC_LOCAL, ZPROP_SRC_DEFAULT, ZPROP_SRC_INHERITED, ZPROP_SRC_RECEIVED, ZPROP_SRC_TEMPORARY, ZPROP_SRC_NONE }; for (i = 0; i < ARRAY_SIZE(source_opt); ++i) if (strcmp(tok, source_opt[i]) == 0) { cb.cb_sources |= source_flg[i]; goto found2; } (void) fprintf(stderr, gettext("invalid source '%s'\n"), tok); usage(B_FALSE); found2:; } break; case 't': types = 0; flags &= ~ZFS_ITER_PROP_LISTSNAPS; for (char *tok; (tok = strsep(&optarg, ",")); ) { static const char *const type_opts[] = { "filesystem", "volume", "snapshot", "snap", "bookmark", "all" }; static const int type_types[] = { ZFS_TYPE_FILESYSTEM, ZFS_TYPE_VOLUME, ZFS_TYPE_SNAPSHOT, ZFS_TYPE_SNAPSHOT, ZFS_TYPE_BOOKMARK, ZFS_TYPE_DATASET | ZFS_TYPE_BOOKMARK }; for (i = 0; i < ARRAY_SIZE(type_opts); ++i) if (strcmp(tok, type_opts[i]) == 0) { types |= type_types[i]; goto found3; } (void) fprintf(stderr, gettext("invalid type '%s'\n"), tok); usage(B_FALSE); found3:; } break; case '?': (void) fprintf(stderr, gettext("invalid option '%c'\n"), optopt); usage(B_FALSE); } } argc -= optind; argv += optind; if (argc < 1) { (void) fprintf(stderr, gettext("missing property " "argument\n")); usage(B_FALSE); } fields = argv[0]; /* * Handle users who want to get all snapshots or bookmarks * of a dataset (ex. 'zfs get -t snapshot refer '). */ if ((types == ZFS_TYPE_SNAPSHOT || types == ZFS_TYPE_BOOKMARK) && argc > 1 && (flags & ZFS_ITER_RECURSE) == 0 && limit == 0) { flags |= (ZFS_ITER_DEPTH_LIMIT | ZFS_ITER_RECURSE); limit = 1; } if (zprop_get_list(g_zfs, fields, &cb.cb_proplist, ZFS_TYPE_DATASET) != 0) usage(B_FALSE); argc--; argv++; /* * As part of zfs_expand_proplist(), we keep track of the maximum column * width for each property. For the 'NAME' (and 'SOURCE') columns, we * need to know the maximum name length. However, the user likely did * not specify 'name' as one of the properties to fetch, so we need to * make sure we always include at least this property for * print_get_headers() to work properly. */ if (cb.cb_proplist != NULL) { fake_name.pl_prop = ZFS_PROP_NAME; fake_name.pl_width = strlen(gettext("NAME")); fake_name.pl_next = cb.cb_proplist; cb.cb_proplist = &fake_name; } cb.cb_first = B_TRUE; /* run for each object */ ret = zfs_for_each(argc, argv, flags, types, NULL, &cb.cb_proplist, limit, get_callback, &cb); if (cb.cb_proplist == &fake_name) zprop_free_list(fake_name.pl_next); else zprop_free_list(cb.cb_proplist); return (ret); } /* * inherit [-rS] ... * * -r Recurse over all children * -S Revert to received value, if any * * For each dataset specified on the command line, inherit the given property * from its parent. Inheriting a property at the pool level will cause it to * use the default value. The '-r' flag will recurse over all children, and is * useful for setting a property on a hierarchy-wide basis, regardless of any * local modifications for each dataset. */ typedef struct inherit_cbdata { const char *cb_propname; boolean_t cb_received; } inherit_cbdata_t; static int inherit_recurse_cb(zfs_handle_t *zhp, void *data) { inherit_cbdata_t *cb = data; zfs_prop_t prop = zfs_name_to_prop(cb->cb_propname); /* * If we're doing it recursively, then ignore properties that * are not valid for this type of dataset. */ if (prop != ZPROP_INVAL && !zfs_prop_valid_for_type(prop, zfs_get_type(zhp), B_FALSE)) return (0); return (zfs_prop_inherit(zhp, cb->cb_propname, cb->cb_received) != 0); } static int inherit_cb(zfs_handle_t *zhp, void *data) { inherit_cbdata_t *cb = data; return (zfs_prop_inherit(zhp, cb->cb_propname, cb->cb_received) != 0); } static int zfs_do_inherit(int argc, char **argv) { int c; zfs_prop_t prop; inherit_cbdata_t cb = { 0 }; char *propname; int ret = 0; int flags = 0; boolean_t received = B_FALSE; /* check options */ while ((c = getopt(argc, argv, "rS")) != -1) { switch (c) { case 'r': flags |= ZFS_ITER_RECURSE; break; case 'S': received = B_TRUE; break; case '?': default: (void) fprintf(stderr, gettext("invalid option '%c'\n"), optopt); usage(B_FALSE); } } argc -= optind; argv += optind; /* check number of arguments */ if (argc < 1) { (void) fprintf(stderr, gettext("missing property argument\n")); usage(B_FALSE); } if (argc < 2) { (void) fprintf(stderr, gettext("missing dataset argument\n")); usage(B_FALSE); } propname = argv[0]; argc--; argv++; if ((prop = zfs_name_to_prop(propname)) != ZPROP_USERPROP) { if (zfs_prop_readonly(prop)) { (void) fprintf(stderr, gettext( "%s property is read-only\n"), propname); return (1); } if (!zfs_prop_inheritable(prop) && !received) { (void) fprintf(stderr, gettext("'%s' property cannot " "be inherited\n"), propname); if (prop == ZFS_PROP_QUOTA || prop == ZFS_PROP_RESERVATION || prop == ZFS_PROP_REFQUOTA || prop == ZFS_PROP_REFRESERVATION) { (void) fprintf(stderr, gettext("use 'zfs set " "%s=none' to clear\n"), propname); (void) fprintf(stderr, gettext("use 'zfs " "inherit -S %s' to revert to received " "value\n"), propname); } return (1); } if (received && (prop == ZFS_PROP_VOLSIZE || prop == ZFS_PROP_VERSION)) { (void) fprintf(stderr, gettext("'%s' property cannot " "be reverted to a received value\n"), propname); return (1); } } else if (!zfs_prop_user(propname)) { (void) fprintf(stderr, gettext("invalid property '%s'\n"), propname); usage(B_FALSE); } cb.cb_propname = propname; cb.cb_received = received; if (flags & ZFS_ITER_RECURSE) { ret = zfs_for_each(argc, argv, flags, ZFS_TYPE_DATASET, NULL, NULL, 0, inherit_recurse_cb, &cb); } else { ret = zfs_for_each(argc, argv, flags, ZFS_TYPE_DATASET, NULL, NULL, 0, inherit_cb, &cb); } return (ret); } typedef struct upgrade_cbdata { uint64_t cb_numupgraded; uint64_t cb_numsamegraded; uint64_t cb_numfailed; uint64_t cb_version; boolean_t cb_newer; boolean_t cb_foundone; char cb_lastfs[ZFS_MAX_DATASET_NAME_LEN]; } upgrade_cbdata_t; static int same_pool(zfs_handle_t *zhp, const char *name) { int len1 = strcspn(name, "/@"); const char *zhname = zfs_get_name(zhp); int len2 = strcspn(zhname, "/@"); if (len1 != len2) return (B_FALSE); return (strncmp(name, zhname, len1) == 0); } static int upgrade_list_callback(zfs_handle_t *zhp, void *data) { upgrade_cbdata_t *cb = data; int version = zfs_prop_get_int(zhp, ZFS_PROP_VERSION); /* list if it's old/new */ if ((!cb->cb_newer && version < ZPL_VERSION) || (cb->cb_newer && version > ZPL_VERSION)) { char *str; if (cb->cb_newer) { str = gettext("The following filesystems are " "formatted using a newer software version and\n" "cannot be accessed on the current system.\n\n"); } else { str = gettext("The following filesystems are " "out of date, and can be upgraded. After being\n" "upgraded, these filesystems (and any 'zfs send' " "streams generated from\n" "subsequent snapshots) will no longer be " "accessible by older software versions.\n\n"); } if (!cb->cb_foundone) { (void) puts(str); (void) printf(gettext("VER FILESYSTEM\n")); (void) printf(gettext("--- ------------\n")); cb->cb_foundone = B_TRUE; } (void) printf("%2u %s\n", version, zfs_get_name(zhp)); } return (0); } static int upgrade_set_callback(zfs_handle_t *zhp, void *data) { upgrade_cbdata_t *cb = data; int version = zfs_prop_get_int(zhp, ZFS_PROP_VERSION); int needed_spa_version; int spa_version; if (zfs_spa_version(zhp, &spa_version) < 0) return (-1); needed_spa_version = zfs_spa_version_map(cb->cb_version); if (needed_spa_version < 0) return (-1); if (spa_version < needed_spa_version) { /* can't upgrade */ (void) printf(gettext("%s: can not be " "upgraded; the pool version needs to first " "be upgraded\nto version %d\n\n"), zfs_get_name(zhp), needed_spa_version); cb->cb_numfailed++; return (0); } /* upgrade */ if (version < cb->cb_version) { char verstr[24]; (void) snprintf(verstr, sizeof (verstr), "%llu", (u_longlong_t)cb->cb_version); if (cb->cb_lastfs[0] && !same_pool(zhp, cb->cb_lastfs)) { /* * If they did "zfs upgrade -a", then we could * be doing ioctls to different pools. We need * to log this history once to each pool, and bypass * the normal history logging that happens in main(). */ (void) zpool_log_history(g_zfs, history_str); log_history = B_FALSE; } if (zfs_prop_set(zhp, "version", verstr) == 0) cb->cb_numupgraded++; else cb->cb_numfailed++; (void) strlcpy(cb->cb_lastfs, zfs_get_name(zhp), sizeof (cb->cb_lastfs)); } else if (version > cb->cb_version) { /* can't downgrade */ (void) printf(gettext("%s: can not be downgraded; " "it is already at version %u\n"), zfs_get_name(zhp), version); cb->cb_numfailed++; } else { cb->cb_numsamegraded++; } return (0); } /* * zfs upgrade * zfs upgrade -v * zfs upgrade [-r] [-V ] <-a | filesystem> */ static int zfs_do_upgrade(int argc, char **argv) { boolean_t all = B_FALSE; boolean_t showversions = B_FALSE; int ret = 0; upgrade_cbdata_t cb = { 0 }; int c; int flags = ZFS_ITER_ARGS_CAN_BE_PATHS; /* check options */ while ((c = getopt(argc, argv, "rvV:a")) != -1) { switch (c) { case 'r': flags |= ZFS_ITER_RECURSE; break; case 'v': showversions = B_TRUE; break; case 'V': if (zfs_prop_string_to_index(ZFS_PROP_VERSION, optarg, &cb.cb_version) != 0) { (void) fprintf(stderr, gettext("invalid version %s\n"), optarg); usage(B_FALSE); } break; case 'a': all = B_TRUE; break; case '?': default: (void) fprintf(stderr, gettext("invalid option '%c'\n"), optopt); usage(B_FALSE); } } argc -= optind; argv += optind; if ((!all && !argc) && ((flags & ZFS_ITER_RECURSE) | cb.cb_version)) usage(B_FALSE); if (showversions && (flags & ZFS_ITER_RECURSE || all || cb.cb_version || argc)) usage(B_FALSE); if ((all || argc) && (showversions)) usage(B_FALSE); if (all && argc) usage(B_FALSE); if (showversions) { /* Show info on available versions. */ (void) printf(gettext("The following filesystem versions are " "supported:\n\n")); (void) printf(gettext("VER DESCRIPTION\n")); (void) printf("--- -----------------------------------------" "---------------\n"); (void) printf(gettext(" 1 Initial ZFS filesystem version\n")); (void) printf(gettext(" 2 Enhanced directory entries\n")); (void) printf(gettext(" 3 Case insensitive and filesystem " "user identifier (FUID)\n")); (void) printf(gettext(" 4 userquota, groupquota " "properties\n")); (void) printf(gettext(" 5 System attributes\n")); (void) printf(gettext("\nFor more information on a particular " "version, including supported releases,\n")); (void) printf("see the ZFS Administration Guide.\n\n"); ret = 0; } else if (argc || all) { /* Upgrade filesystems */ if (cb.cb_version == 0) cb.cb_version = ZPL_VERSION; ret = zfs_for_each(argc, argv, flags, ZFS_TYPE_FILESYSTEM, NULL, NULL, 0, upgrade_set_callback, &cb); (void) printf(gettext("%llu filesystems upgraded\n"), (u_longlong_t)cb.cb_numupgraded); if (cb.cb_numsamegraded) { (void) printf(gettext("%llu filesystems already at " "this version\n"), (u_longlong_t)cb.cb_numsamegraded); } if (cb.cb_numfailed != 0) ret = 1; } else { /* List old-version filesystems */ boolean_t found; (void) printf(gettext("This system is currently running " "ZFS filesystem version %llu.\n\n"), ZPL_VERSION); flags |= ZFS_ITER_RECURSE; ret = zfs_for_each(0, NULL, flags, ZFS_TYPE_FILESYSTEM, NULL, NULL, 0, upgrade_list_callback, &cb); found = cb.cb_foundone; cb.cb_foundone = B_FALSE; cb.cb_newer = B_TRUE; - ret = zfs_for_each(0, NULL, flags, ZFS_TYPE_FILESYSTEM, + ret |= zfs_for_each(0, NULL, flags, ZFS_TYPE_FILESYSTEM, NULL, NULL, 0, upgrade_list_callback, &cb); if (!cb.cb_foundone && !found) { (void) printf(gettext("All filesystems are " "formatted with the current version.\n")); } } return (ret); } /* * zfs userspace [-Hinp] [-o field[,...]] [-s field [-s field]...] * [-S field [-S field]...] [-t type[,...]] * filesystem | snapshot | path * zfs groupspace [-Hinp] [-o field[,...]] [-s field [-s field]...] * [-S field [-S field]...] [-t type[,...]] * filesystem | snapshot | path * zfs projectspace [-Hp] [-o field[,...]] [-s field [-s field]...] * [-S field [-S field]...] filesystem | snapshot | path * * -H Scripted mode; elide headers and separate columns by tabs. * -i Translate SID to POSIX ID. * -n Print numeric ID instead of user/group name. * -o Control which fields to display. * -p Use exact (parsable) numeric output. * -s Specify sort columns, descending order. * -S Specify sort columns, ascending order. * -t Control which object types to display. * * Displays space consumed by, and quotas on, each user in the specified * filesystem or snapshot. */ /* us_field_types, us_field_hdr and us_field_names should be kept in sync */ enum us_field_types { USFIELD_TYPE, USFIELD_NAME, USFIELD_USED, USFIELD_QUOTA, USFIELD_OBJUSED, USFIELD_OBJQUOTA }; static const char *const us_field_hdr[] = { "TYPE", "NAME", "USED", "QUOTA", "OBJUSED", "OBJQUOTA" }; static const char *const us_field_names[] = { "type", "name", "used", "quota", "objused", "objquota" }; #define USFIELD_LAST (sizeof (us_field_names) / sizeof (char *)) #define USTYPE_PSX_GRP (1 << 0) #define USTYPE_PSX_USR (1 << 1) #define USTYPE_SMB_GRP (1 << 2) #define USTYPE_SMB_USR (1 << 3) #define USTYPE_PROJ (1 << 4) #define USTYPE_ALL \ (USTYPE_PSX_GRP | USTYPE_PSX_USR | USTYPE_SMB_GRP | USTYPE_SMB_USR | \ USTYPE_PROJ) static int us_type_bits[] = { USTYPE_PSX_GRP, USTYPE_PSX_USR, USTYPE_SMB_GRP, USTYPE_SMB_USR, USTYPE_ALL }; static const char *const us_type_names[] = { "posixgroup", "posixuser", "smbgroup", "smbuser", "all" }; typedef struct us_node { nvlist_t *usn_nvl; uu_avl_node_t usn_avlnode; uu_list_node_t usn_listnode; } us_node_t; typedef struct us_cbdata { nvlist_t **cb_nvlp; uu_avl_pool_t *cb_avl_pool; uu_avl_t *cb_avl; boolean_t cb_numname; boolean_t cb_nicenum; boolean_t cb_sid2posix; zfs_userquota_prop_t cb_prop; zfs_sort_column_t *cb_sortcol; size_t cb_width[USFIELD_LAST]; } us_cbdata_t; static boolean_t us_populated = B_FALSE; typedef struct { zfs_sort_column_t *si_sortcol; boolean_t si_numname; } us_sort_info_t; static int us_field_index(const char *field) { for (int i = 0; i < USFIELD_LAST; i++) { if (strcmp(field, us_field_names[i]) == 0) return (i); } return (-1); } static int us_compare(const void *larg, const void *rarg, void *unused) { const us_node_t *l = larg; const us_node_t *r = rarg; us_sort_info_t *si = (us_sort_info_t *)unused; zfs_sort_column_t *sortcol = si->si_sortcol; boolean_t numname = si->si_numname; nvlist_t *lnvl = l->usn_nvl; nvlist_t *rnvl = r->usn_nvl; int rc = 0; boolean_t lvb, rvb; for (; sortcol != NULL; sortcol = sortcol->sc_next) { char *lvstr = (char *)""; char *rvstr = (char *)""; uint32_t lv32 = 0; uint32_t rv32 = 0; uint64_t lv64 = 0; uint64_t rv64 = 0; zfs_prop_t prop = sortcol->sc_prop; const char *propname = NULL; boolean_t reverse = sortcol->sc_reverse; switch (prop) { case ZFS_PROP_TYPE: propname = "type"; (void) nvlist_lookup_uint32(lnvl, propname, &lv32); (void) nvlist_lookup_uint32(rnvl, propname, &rv32); if (rv32 != lv32) rc = (rv32 < lv32) ? 1 : -1; break; case ZFS_PROP_NAME: propname = "name"; if (numname) { compare_nums: (void) nvlist_lookup_uint64(lnvl, propname, &lv64); (void) nvlist_lookup_uint64(rnvl, propname, &rv64); if (rv64 != lv64) rc = (rv64 < lv64) ? 1 : -1; } else { if ((nvlist_lookup_string(lnvl, propname, &lvstr) == ENOENT) || (nvlist_lookup_string(rnvl, propname, &rvstr) == ENOENT)) { goto compare_nums; } rc = strcmp(lvstr, rvstr); } break; case ZFS_PROP_USED: case ZFS_PROP_QUOTA: if (!us_populated) break; if (prop == ZFS_PROP_USED) propname = "used"; else propname = "quota"; (void) nvlist_lookup_uint64(lnvl, propname, &lv64); (void) nvlist_lookup_uint64(rnvl, propname, &rv64); if (rv64 != lv64) rc = (rv64 < lv64) ? 1 : -1; break; default: break; } if (rc != 0) { if (rc < 0) return (reverse ? 1 : -1); else return (reverse ? -1 : 1); } } /* * If entries still seem to be the same, check if they are of the same * type (smbentity is added only if we are doing SID to POSIX ID * translation where we can have duplicate type/name combinations). */ if (nvlist_lookup_boolean_value(lnvl, "smbentity", &lvb) == 0 && nvlist_lookup_boolean_value(rnvl, "smbentity", &rvb) == 0 && lvb != rvb) return (lvb < rvb ? -1 : 1); return (0); } static boolean_t zfs_prop_is_user(unsigned p) { return (p == ZFS_PROP_USERUSED || p == ZFS_PROP_USERQUOTA || p == ZFS_PROP_USEROBJUSED || p == ZFS_PROP_USEROBJQUOTA); } static boolean_t zfs_prop_is_group(unsigned p) { return (p == ZFS_PROP_GROUPUSED || p == ZFS_PROP_GROUPQUOTA || p == ZFS_PROP_GROUPOBJUSED || p == ZFS_PROP_GROUPOBJQUOTA); } static boolean_t zfs_prop_is_project(unsigned p) { return (p == ZFS_PROP_PROJECTUSED || p == ZFS_PROP_PROJECTQUOTA || p == ZFS_PROP_PROJECTOBJUSED || p == ZFS_PROP_PROJECTOBJQUOTA); } static inline const char * us_type2str(unsigned field_type) { switch (field_type) { case USTYPE_PSX_USR: return ("POSIX User"); case USTYPE_PSX_GRP: return ("POSIX Group"); case USTYPE_SMB_USR: return ("SMB User"); case USTYPE_SMB_GRP: return ("SMB Group"); case USTYPE_PROJ: return ("Project"); default: return ("Undefined"); } } static int userspace_cb(void *arg, const char *domain, uid_t rid, uint64_t space) { us_cbdata_t *cb = (us_cbdata_t *)arg; zfs_userquota_prop_t prop = cb->cb_prop; char *name = NULL; const char *propname; char sizebuf[32]; us_node_t *node; uu_avl_pool_t *avl_pool = cb->cb_avl_pool; uu_avl_t *avl = cb->cb_avl; uu_avl_index_t idx; nvlist_t *props; us_node_t *n; zfs_sort_column_t *sortcol = cb->cb_sortcol; unsigned type = 0; const char *typestr; size_t namelen; size_t typelen; size_t sizelen; int typeidx, nameidx, sizeidx; us_sort_info_t sortinfo = { sortcol, cb->cb_numname }; boolean_t smbentity = B_FALSE; if (nvlist_alloc(&props, NV_UNIQUE_NAME, 0) != 0) nomem(); node = safe_malloc(sizeof (us_node_t)); uu_avl_node_init(node, &node->usn_avlnode, avl_pool); node->usn_nvl = props; if (domain != NULL && domain[0] != '\0') { #ifdef HAVE_IDMAP /* SMB */ char sid[MAXNAMELEN + 32]; uid_t id; uint64_t classes; int err; directory_error_t e; smbentity = B_TRUE; (void) snprintf(sid, sizeof (sid), "%s-%u", domain, rid); if (prop == ZFS_PROP_GROUPUSED || prop == ZFS_PROP_GROUPQUOTA) { type = USTYPE_SMB_GRP; err = sid_to_id(sid, B_FALSE, &id); } else { type = USTYPE_SMB_USR; err = sid_to_id(sid, B_TRUE, &id); } if (err == 0) { rid = id; if (!cb->cb_sid2posix) { e = directory_name_from_sid(NULL, sid, &name, &classes); if (e != NULL) directory_error_free(e); if (name == NULL) name = sid; } } #else nvlist_free(props); free(node); return (-1); #endif /* HAVE_IDMAP */ } if (cb->cb_sid2posix || domain == NULL || domain[0] == '\0') { /* POSIX or -i */ if (zfs_prop_is_group(prop)) { type = USTYPE_PSX_GRP; if (!cb->cb_numname) { struct group *g; if ((g = getgrgid(rid)) != NULL) name = g->gr_name; } } else if (zfs_prop_is_user(prop)) { type = USTYPE_PSX_USR; if (!cb->cb_numname) { struct passwd *p; if ((p = getpwuid(rid)) != NULL) name = p->pw_name; } } else { type = USTYPE_PROJ; } } /* * Make sure that the type/name combination is unique when doing * SID to POSIX ID translation (hence changing the type from SMB to * POSIX). */ if (cb->cb_sid2posix && nvlist_add_boolean_value(props, "smbentity", smbentity) != 0) nomem(); /* Calculate/update width of TYPE field */ typestr = us_type2str(type); typelen = strlen(gettext(typestr)); typeidx = us_field_index("type"); if (typelen > cb->cb_width[typeidx]) cb->cb_width[typeidx] = typelen; if (nvlist_add_uint32(props, "type", type) != 0) nomem(); /* Calculate/update width of NAME field */ if ((cb->cb_numname && cb->cb_sid2posix) || name == NULL) { if (nvlist_add_uint64(props, "name", rid) != 0) nomem(); namelen = snprintf(NULL, 0, "%u", rid); } else { if (nvlist_add_string(props, "name", name) != 0) nomem(); namelen = strlen(name); } nameidx = us_field_index("name"); if (nameidx >= 0 && namelen > cb->cb_width[nameidx]) cb->cb_width[nameidx] = namelen; /* * Check if this type/name combination is in the list and update it; * otherwise add new node to the list. */ if ((n = uu_avl_find(avl, node, &sortinfo, &idx)) == NULL) { uu_avl_insert(avl, node, idx); } else { nvlist_free(props); free(node); node = n; props = node->usn_nvl; } /* Calculate/update width of USED/QUOTA fields */ if (cb->cb_nicenum) { if (prop == ZFS_PROP_USERUSED || prop == ZFS_PROP_GROUPUSED || prop == ZFS_PROP_USERQUOTA || prop == ZFS_PROP_GROUPQUOTA || prop == ZFS_PROP_PROJECTUSED || prop == ZFS_PROP_PROJECTQUOTA) { zfs_nicebytes(space, sizebuf, sizeof (sizebuf)); } else { zfs_nicenum(space, sizebuf, sizeof (sizebuf)); } } else { (void) snprintf(sizebuf, sizeof (sizebuf), "%llu", (u_longlong_t)space); } sizelen = strlen(sizebuf); if (prop == ZFS_PROP_USERUSED || prop == ZFS_PROP_GROUPUSED || prop == ZFS_PROP_PROJECTUSED) { propname = "used"; if (!nvlist_exists(props, "quota")) (void) nvlist_add_uint64(props, "quota", 0); } else if (prop == ZFS_PROP_USERQUOTA || prop == ZFS_PROP_GROUPQUOTA || prop == ZFS_PROP_PROJECTQUOTA) { propname = "quota"; if (!nvlist_exists(props, "used")) (void) nvlist_add_uint64(props, "used", 0); } else if (prop == ZFS_PROP_USEROBJUSED || prop == ZFS_PROP_GROUPOBJUSED || prop == ZFS_PROP_PROJECTOBJUSED) { propname = "objused"; if (!nvlist_exists(props, "objquota")) (void) nvlist_add_uint64(props, "objquota", 0); } else if (prop == ZFS_PROP_USEROBJQUOTA || prop == ZFS_PROP_GROUPOBJQUOTA || prop == ZFS_PROP_PROJECTOBJQUOTA) { propname = "objquota"; if (!nvlist_exists(props, "objused")) (void) nvlist_add_uint64(props, "objused", 0); } else { return (-1); } sizeidx = us_field_index(propname); if (sizeidx >= 0 && sizelen > cb->cb_width[sizeidx]) cb->cb_width[sizeidx] = sizelen; if (nvlist_add_uint64(props, propname, space) != 0) nomem(); return (0); } static void print_us_node(boolean_t scripted, boolean_t parsable, int *fields, int types, size_t *width, us_node_t *node) { nvlist_t *nvl = node->usn_nvl; char valstr[MAXNAMELEN]; boolean_t first = B_TRUE; int cfield = 0; int field; uint32_t ustype; /* Check type */ (void) nvlist_lookup_uint32(nvl, "type", &ustype); if (!(ustype & types)) return; while ((field = fields[cfield]) != USFIELD_LAST) { nvpair_t *nvp = NULL; data_type_t type; uint32_t val32 = -1; uint64_t val64 = -1; const char *strval = "-"; while ((nvp = nvlist_next_nvpair(nvl, nvp)) != NULL) if (strcmp(nvpair_name(nvp), us_field_names[field]) == 0) break; type = nvp == NULL ? DATA_TYPE_UNKNOWN : nvpair_type(nvp); switch (type) { case DATA_TYPE_UINT32: val32 = fnvpair_value_uint32(nvp); break; case DATA_TYPE_UINT64: val64 = fnvpair_value_uint64(nvp); break; case DATA_TYPE_STRING: strval = fnvpair_value_string(nvp); break; case DATA_TYPE_UNKNOWN: break; default: (void) fprintf(stderr, "invalid data type\n"); } switch (field) { case USFIELD_TYPE: if (type == DATA_TYPE_UINT32) strval = us_type2str(val32); break; case USFIELD_NAME: if (type == DATA_TYPE_UINT64) { (void) sprintf(valstr, "%llu", (u_longlong_t)val64); strval = valstr; } break; case USFIELD_USED: case USFIELD_QUOTA: if (type == DATA_TYPE_UINT64) { if (parsable) { (void) sprintf(valstr, "%llu", (u_longlong_t)val64); strval = valstr; } else if (field == USFIELD_QUOTA && val64 == 0) { strval = "none"; } else { zfs_nicebytes(val64, valstr, sizeof (valstr)); strval = valstr; } } break; case USFIELD_OBJUSED: case USFIELD_OBJQUOTA: if (type == DATA_TYPE_UINT64) { if (parsable) { (void) sprintf(valstr, "%llu", (u_longlong_t)val64); strval = valstr; } else if (field == USFIELD_OBJQUOTA && val64 == 0) { strval = "none"; } else { zfs_nicenum(val64, valstr, sizeof (valstr)); strval = valstr; } } break; } if (!first) { if (scripted) (void) putchar('\t'); else (void) fputs(" ", stdout); } if (scripted) (void) fputs(strval, stdout); else if (field == USFIELD_TYPE || field == USFIELD_NAME) (void) printf("%-*s", (int)width[field], strval); else (void) printf("%*s", (int)width[field], strval); first = B_FALSE; cfield++; } (void) putchar('\n'); } static void print_us(boolean_t scripted, boolean_t parsable, int *fields, int types, size_t *width, boolean_t rmnode, uu_avl_t *avl) { us_node_t *node; const char *col; int cfield = 0; int field; if (!scripted) { boolean_t first = B_TRUE; while ((field = fields[cfield]) != USFIELD_LAST) { col = gettext(us_field_hdr[field]); if (field == USFIELD_TYPE || field == USFIELD_NAME) { (void) printf(first ? "%-*s" : " %-*s", (int)width[field], col); } else { (void) printf(first ? "%*s" : " %*s", (int)width[field], col); } first = B_FALSE; cfield++; } (void) printf("\n"); } for (node = uu_avl_first(avl); node; node = uu_avl_next(avl, node)) { print_us_node(scripted, parsable, fields, types, width, node); if (rmnode) nvlist_free(node->usn_nvl); } } static int zfs_do_userspace(int argc, char **argv) { zfs_handle_t *zhp; zfs_userquota_prop_t p; uu_avl_pool_t *avl_pool; uu_avl_t *avl_tree; uu_avl_walk_t *walk; char *delim; char deffields[] = "type,name,used,quota,objused,objquota"; char *ofield = NULL; char *tfield = NULL; int cfield = 0; int fields[256]; int i; boolean_t scripted = B_FALSE; boolean_t prtnum = B_FALSE; boolean_t parsable = B_FALSE; boolean_t sid2posix = B_FALSE; int ret = 0; int c; zfs_sort_column_t *sortcol = NULL; int types = USTYPE_PSX_USR | USTYPE_SMB_USR; us_cbdata_t cb; us_node_t *node; us_node_t *rmnode; uu_list_pool_t *listpool; uu_list_t *list; uu_avl_index_t idx = 0; uu_list_index_t idx2 = 0; if (argc < 2) usage(B_FALSE); if (strcmp(argv[0], "groupspace") == 0) { /* Toggle default group types */ types = USTYPE_PSX_GRP | USTYPE_SMB_GRP; } else if (strcmp(argv[0], "projectspace") == 0) { types = USTYPE_PROJ; prtnum = B_TRUE; } while ((c = getopt(argc, argv, "nHpo:s:S:t:i")) != -1) { switch (c) { case 'n': if (types == USTYPE_PROJ) { (void) fprintf(stderr, gettext("invalid option 'n'\n")); usage(B_FALSE); } prtnum = B_TRUE; break; case 'H': scripted = B_TRUE; break; case 'p': parsable = B_TRUE; break; case 'o': ofield = optarg; break; case 's': case 'S': if (zfs_add_sort_column(&sortcol, optarg, c == 's' ? B_FALSE : B_TRUE) != 0) { (void) fprintf(stderr, gettext("invalid field '%s'\n"), optarg); usage(B_FALSE); } break; case 't': if (types == USTYPE_PROJ) { (void) fprintf(stderr, gettext("invalid option 't'\n")); usage(B_FALSE); } tfield = optarg; break; case 'i': if (types == USTYPE_PROJ) { (void) fprintf(stderr, gettext("invalid option 'i'\n")); usage(B_FALSE); } sid2posix = B_TRUE; break; case ':': (void) fprintf(stderr, gettext("missing argument for " "'%c' option\n"), optopt); usage(B_FALSE); break; case '?': (void) fprintf(stderr, gettext("invalid option '%c'\n"), optopt); usage(B_FALSE); } } argc -= optind; argv += optind; if (argc < 1) { (void) fprintf(stderr, gettext("missing dataset name\n")); usage(B_FALSE); } if (argc > 1) { (void) fprintf(stderr, gettext("too many arguments\n")); usage(B_FALSE); } /* Use default output fields if not specified using -o */ if (ofield == NULL) ofield = deffields; do { if ((delim = strchr(ofield, ',')) != NULL) *delim = '\0'; if ((fields[cfield++] = us_field_index(ofield)) == -1) { (void) fprintf(stderr, gettext("invalid type '%s' " "for -o option\n"), ofield); return (-1); } if (delim != NULL) ofield = delim + 1; } while (delim != NULL); fields[cfield] = USFIELD_LAST; /* Override output types (-t option) */ if (tfield != NULL) { types = 0; do { boolean_t found = B_FALSE; if ((delim = strchr(tfield, ',')) != NULL) *delim = '\0'; for (i = 0; i < sizeof (us_type_bits) / sizeof (int); i++) { if (strcmp(tfield, us_type_names[i]) == 0) { found = B_TRUE; types |= us_type_bits[i]; break; } } if (!found) { (void) fprintf(stderr, gettext("invalid type " "'%s' for -t option\n"), tfield); return (-1); } if (delim != NULL) tfield = delim + 1; } while (delim != NULL); } if ((zhp = zfs_path_to_zhandle(g_zfs, argv[0], ZFS_TYPE_FILESYSTEM | ZFS_TYPE_SNAPSHOT)) == NULL) return (1); if (zfs_get_underlying_type(zhp) != ZFS_TYPE_FILESYSTEM) { (void) fprintf(stderr, gettext("operation is only applicable " "to filesystems and their snapshots\n")); zfs_close(zhp); return (1); } if ((avl_pool = uu_avl_pool_create("us_avl_pool", sizeof (us_node_t), offsetof(us_node_t, usn_avlnode), us_compare, UU_DEFAULT)) == NULL) nomem(); if ((avl_tree = uu_avl_create(avl_pool, NULL, UU_DEFAULT)) == NULL) nomem(); /* Always add default sorting columns */ (void) zfs_add_sort_column(&sortcol, "type", B_FALSE); (void) zfs_add_sort_column(&sortcol, "name", B_FALSE); cb.cb_sortcol = sortcol; cb.cb_numname = prtnum; cb.cb_nicenum = !parsable; cb.cb_avl_pool = avl_pool; cb.cb_avl = avl_tree; cb.cb_sid2posix = sid2posix; for (i = 0; i < USFIELD_LAST; i++) cb.cb_width[i] = strlen(gettext(us_field_hdr[i])); for (p = 0; p < ZFS_NUM_USERQUOTA_PROPS; p++) { if ((zfs_prop_is_user(p) && !(types & (USTYPE_PSX_USR | USTYPE_SMB_USR))) || (zfs_prop_is_group(p) && !(types & (USTYPE_PSX_GRP | USTYPE_SMB_GRP))) || (zfs_prop_is_project(p) && types != USTYPE_PROJ)) continue; cb.cb_prop = p; if ((ret = zfs_userspace(zhp, p, userspace_cb, &cb)) != 0) { zfs_close(zhp); return (ret); } } zfs_close(zhp); /* Sort the list */ if ((node = uu_avl_first(avl_tree)) == NULL) return (0); us_populated = B_TRUE; listpool = uu_list_pool_create("tmplist", sizeof (us_node_t), offsetof(us_node_t, usn_listnode), NULL, UU_DEFAULT); list = uu_list_create(listpool, NULL, UU_DEFAULT); uu_list_node_init(node, &node->usn_listnode, listpool); while (node != NULL) { rmnode = node; node = uu_avl_next(avl_tree, node); uu_avl_remove(avl_tree, rmnode); if (uu_list_find(list, rmnode, NULL, &idx2) == NULL) uu_list_insert(list, rmnode, idx2); } for (node = uu_list_first(list); node != NULL; node = uu_list_next(list, node)) { us_sort_info_t sortinfo = { sortcol, cb.cb_numname }; if (uu_avl_find(avl_tree, node, &sortinfo, &idx) == NULL) uu_avl_insert(avl_tree, node, idx); } uu_list_destroy(list); uu_list_pool_destroy(listpool); /* Print and free node nvlist memory */ print_us(scripted, parsable, fields, types, cb.cb_width, B_TRUE, cb.cb_avl); zfs_free_sort_columns(sortcol); /* Clean up the AVL tree */ if ((walk = uu_avl_walk_start(cb.cb_avl, UU_WALK_ROBUST)) == NULL) nomem(); while ((node = uu_avl_walk_next(walk)) != NULL) { uu_avl_remove(cb.cb_avl, node); free(node); } uu_avl_walk_end(walk); uu_avl_destroy(avl_tree); uu_avl_pool_destroy(avl_pool); return (ret); } /* * list [-Hp][-r|-d max] [-o property[,...]] [-s property] ... [-S property] * [-t type[,...]] [filesystem|volume|snapshot] ... * * -H Scripted mode; elide headers and separate columns by tabs * -p Display values in parsable (literal) format. * -r Recurse over all children * -d Limit recursion by depth. * -o Control which fields to display. * -s Specify sort columns, descending order. * -S Specify sort columns, ascending order. * -t Control which object types to display. * * When given no arguments, list all filesystems in the system. * Otherwise, list the specified datasets, optionally recursing down them if * '-r' is specified. */ typedef struct list_cbdata { boolean_t cb_first; boolean_t cb_literal; boolean_t cb_scripted; zprop_list_t *cb_proplist; } list_cbdata_t; /* * Given a list of columns to display, output appropriate headers for each one. */ static void print_header(list_cbdata_t *cb) { zprop_list_t *pl = cb->cb_proplist; char headerbuf[ZFS_MAXPROPLEN]; const char *header; int i; boolean_t first = B_TRUE; boolean_t right_justify; for (; pl != NULL; pl = pl->pl_next) { if (!first) { (void) printf(" "); } else { first = B_FALSE; } right_justify = B_FALSE; if (pl->pl_prop != ZPROP_USERPROP) { header = zfs_prop_column_name(pl->pl_prop); right_justify = zfs_prop_align_right(pl->pl_prop); } else { for (i = 0; pl->pl_user_prop[i] != '\0'; i++) headerbuf[i] = toupper(pl->pl_user_prop[i]); headerbuf[i] = '\0'; header = headerbuf; } if (pl->pl_next == NULL && !right_justify) (void) printf("%s", header); else if (right_justify) (void) printf("%*s", (int)pl->pl_width, header); else (void) printf("%-*s", (int)pl->pl_width, header); } (void) printf("\n"); } /* * Given a dataset and a list of fields, print out all the properties according * to the described layout. */ static void print_dataset(zfs_handle_t *zhp, list_cbdata_t *cb) { zprop_list_t *pl = cb->cb_proplist; boolean_t first = B_TRUE; char property[ZFS_MAXPROPLEN]; nvlist_t *userprops = zfs_get_user_props(zhp); nvlist_t *propval; const char *propstr; boolean_t right_justify; for (; pl != NULL; pl = pl->pl_next) { if (!first) { if (cb->cb_scripted) (void) putchar('\t'); else (void) fputs(" ", stdout); } else { first = B_FALSE; } if (pl->pl_prop == ZFS_PROP_NAME) { (void) strlcpy(property, zfs_get_name(zhp), sizeof (property)); propstr = property; right_justify = zfs_prop_align_right(pl->pl_prop); } else if (pl->pl_prop != ZPROP_USERPROP) { if (zfs_prop_get(zhp, pl->pl_prop, property, sizeof (property), NULL, NULL, 0, cb->cb_literal) != 0) propstr = "-"; else propstr = property; right_justify = zfs_prop_align_right(pl->pl_prop); } else if (zfs_prop_userquota(pl->pl_user_prop)) { if (zfs_prop_get_userquota(zhp, pl->pl_user_prop, property, sizeof (property), cb->cb_literal) != 0) propstr = "-"; else propstr = property; right_justify = B_TRUE; } else if (zfs_prop_written(pl->pl_user_prop)) { if (zfs_prop_get_written(zhp, pl->pl_user_prop, property, sizeof (property), cb->cb_literal) != 0) propstr = "-"; else propstr = property; right_justify = B_TRUE; } else { if (nvlist_lookup_nvlist(userprops, pl->pl_user_prop, &propval) != 0) propstr = "-"; else propstr = fnvlist_lookup_string(propval, ZPROP_VALUE); right_justify = B_FALSE; } /* * If this is being called in scripted mode, or if this is the * last column and it is left-justified, don't include a width * format specifier. */ if (cb->cb_scripted || (pl->pl_next == NULL && !right_justify)) (void) fputs(propstr, stdout); else if (right_justify) (void) printf("%*s", (int)pl->pl_width, propstr); else (void) printf("%-*s", (int)pl->pl_width, propstr); } (void) putchar('\n'); } /* * Generic callback function to list a dataset or snapshot. */ static int list_callback(zfs_handle_t *zhp, void *data) { list_cbdata_t *cbp = data; if (cbp->cb_first) { if (!cbp->cb_scripted) print_header(cbp); cbp->cb_first = B_FALSE; } print_dataset(zhp, cbp); return (0); } static int zfs_do_list(int argc, char **argv) { int c; char default_fields[] = "name,used,available,referenced,mountpoint"; int types = ZFS_TYPE_DATASET; boolean_t types_specified = B_FALSE; char *fields = default_fields; list_cbdata_t cb = { 0 }; int limit = 0; int ret = 0; zfs_sort_column_t *sortcol = NULL; int flags = ZFS_ITER_PROP_LISTSNAPS | ZFS_ITER_ARGS_CAN_BE_PATHS; /* check options */ while ((c = getopt(argc, argv, "HS:d:o:prs:t:")) != -1) { switch (c) { case 'o': fields = optarg; break; case 'p': cb.cb_literal = B_TRUE; flags |= ZFS_ITER_LITERAL_PROPS; break; case 'd': limit = parse_depth(optarg, &flags); break; case 'r': flags |= ZFS_ITER_RECURSE; break; case 'H': cb.cb_scripted = B_TRUE; break; case 's': if (zfs_add_sort_column(&sortcol, optarg, B_FALSE) != 0) { (void) fprintf(stderr, gettext("invalid property '%s'\n"), optarg); usage(B_FALSE); } break; case 'S': if (zfs_add_sort_column(&sortcol, optarg, B_TRUE) != 0) { (void) fprintf(stderr, gettext("invalid property '%s'\n"), optarg); usage(B_FALSE); } break; case 't': types = 0; types_specified = B_TRUE; flags &= ~ZFS_ITER_PROP_LISTSNAPS; for (char *tok; (tok = strsep(&optarg, ",")); ) { static const char *const type_subopts[] = { "filesystem", "volume", "snapshot", "snap", "bookmark", "all" }; static const int type_types[] = { ZFS_TYPE_FILESYSTEM, ZFS_TYPE_VOLUME, ZFS_TYPE_SNAPSHOT, ZFS_TYPE_SNAPSHOT, ZFS_TYPE_BOOKMARK, ZFS_TYPE_DATASET | ZFS_TYPE_BOOKMARK }; for (c = 0; c < ARRAY_SIZE(type_subopts); ++c) if (strcmp(tok, type_subopts[c]) == 0) { types |= type_types[c]; goto found3; } (void) fprintf(stderr, gettext("invalid type '%s'\n"), tok); usage(B_FALSE); found3:; } break; case ':': (void) fprintf(stderr, gettext("missing argument for " "'%c' option\n"), optopt); usage(B_FALSE); break; case '?': (void) fprintf(stderr, gettext("invalid option '%c'\n"), optopt); usage(B_FALSE); } } argc -= optind; argv += optind; /* * If we are only going to list snapshot names and sort by name or * by createtxg, then we can use faster version. */ if (strcmp(fields, "name") == 0 && (zfs_sort_only_by_name(sortcol) || zfs_sort_only_by_createtxg(sortcol))) { flags |= ZFS_ITER_SIMPLE; } /* * If "-o space" and no types were specified, don't display snapshots. */ if (strcmp(fields, "space") == 0 && types_specified == B_FALSE) types &= ~ZFS_TYPE_SNAPSHOT; /* * Handle users who want to list all snapshots or bookmarks * of the current dataset (ex. 'zfs list -t snapshot '). */ if ((types == ZFS_TYPE_SNAPSHOT || types == ZFS_TYPE_BOOKMARK) && argc > 0 && (flags & ZFS_ITER_RECURSE) == 0 && limit == 0) { flags |= (ZFS_ITER_DEPTH_LIMIT | ZFS_ITER_RECURSE); limit = 1; } /* * If the user specifies '-o all', the zprop_get_list() doesn't * normally include the name of the dataset. For 'zfs list', we always * want this property to be first. */ if (zprop_get_list(g_zfs, fields, &cb.cb_proplist, ZFS_TYPE_DATASET) != 0) usage(B_FALSE); cb.cb_first = B_TRUE; ret = zfs_for_each(argc, argv, flags, types, sortcol, &cb.cb_proplist, limit, list_callback, &cb); zprop_free_list(cb.cb_proplist); zfs_free_sort_columns(sortcol); if (ret == 0 && cb.cb_first && !cb.cb_scripted) (void) fprintf(stderr, gettext("no datasets available\n")); return (ret); } /* * zfs rename [-fu] * zfs rename [-f] -p * zfs rename [-u] -r * * Renames the given dataset to another of the same type. * * The '-p' flag creates all the non-existing ancestors of the target first. * The '-u' flag prevents file systems from being remounted during rename. */ static int zfs_do_rename(int argc, char **argv) { zfs_handle_t *zhp; renameflags_t flags = { 0 }; int c; int ret = 0; int types; boolean_t parents = B_FALSE; /* check options */ while ((c = getopt(argc, argv, "pruf")) != -1) { switch (c) { case 'p': parents = B_TRUE; break; case 'r': flags.recursive = B_TRUE; break; case 'u': flags.nounmount = B_TRUE; break; case 'f': flags.forceunmount = B_TRUE; break; case '?': default: (void) fprintf(stderr, gettext("invalid option '%c'\n"), optopt); usage(B_FALSE); } } argc -= optind; argv += optind; /* check number of arguments */ if (argc < 1) { (void) fprintf(stderr, gettext("missing source dataset " "argument\n")); usage(B_FALSE); } if (argc < 2) { (void) fprintf(stderr, gettext("missing target dataset " "argument\n")); usage(B_FALSE); } if (argc > 2) { (void) fprintf(stderr, gettext("too many arguments\n")); usage(B_FALSE); } if (flags.recursive && parents) { (void) fprintf(stderr, gettext("-p and -r options are mutually " "exclusive\n")); usage(B_FALSE); } if (flags.nounmount && parents) { (void) fprintf(stderr, gettext("-u and -p options are mutually " "exclusive\n")); usage(B_FALSE); } if (flags.recursive && strchr(argv[0], '@') == 0) { (void) fprintf(stderr, gettext("source dataset for recursive " "rename must be a snapshot\n")); usage(B_FALSE); } if (flags.nounmount) types = ZFS_TYPE_FILESYSTEM; else if (parents) types = ZFS_TYPE_FILESYSTEM | ZFS_TYPE_VOLUME; else types = ZFS_TYPE_DATASET; if ((zhp = zfs_open(g_zfs, argv[0], types)) == NULL) return (1); /* If we were asked and the name looks good, try to create ancestors. */ if (parents && zfs_name_valid(argv[1], zfs_get_type(zhp)) && zfs_create_ancestors(g_zfs, argv[1]) != 0) { zfs_close(zhp); return (1); } ret = (zfs_rename(zhp, argv[1], flags) != 0); zfs_close(zhp); return (ret); } /* * zfs promote * * Promotes the given clone fs to be the parent */ static int zfs_do_promote(int argc, char **argv) { zfs_handle_t *zhp; int ret = 0; /* check options */ if (argc > 1 && argv[1][0] == '-') { (void) fprintf(stderr, gettext("invalid option '%c'\n"), argv[1][1]); usage(B_FALSE); } /* check number of arguments */ if (argc < 2) { (void) fprintf(stderr, gettext("missing clone filesystem" " argument\n")); usage(B_FALSE); } if (argc > 2) { (void) fprintf(stderr, gettext("too many arguments\n")); usage(B_FALSE); } zhp = zfs_open(g_zfs, argv[1], ZFS_TYPE_FILESYSTEM | ZFS_TYPE_VOLUME); if (zhp == NULL) return (1); ret = (zfs_promote(zhp) != 0); zfs_close(zhp); return (ret); } static int zfs_do_redact(int argc, char **argv) { char *snap = NULL; char *bookname = NULL; char **rsnaps = NULL; int numrsnaps = 0; argv++; argc--; if (argc < 3) { (void) fprintf(stderr, gettext("too few arguments\n")); usage(B_FALSE); } snap = argv[0]; bookname = argv[1]; rsnaps = argv + 2; numrsnaps = argc - 2; nvlist_t *rsnapnv = fnvlist_alloc(); for (int i = 0; i < numrsnaps; i++) { fnvlist_add_boolean(rsnapnv, rsnaps[i]); } int err = lzc_redact(snap, bookname, rsnapnv); fnvlist_free(rsnapnv); switch (err) { case 0: break; case ENOENT: (void) fprintf(stderr, gettext("provided snapshot %s does not exist\n"), snap); break; case EEXIST: (void) fprintf(stderr, gettext("specified redaction bookmark " "(%s) provided already exists\n"), bookname); break; case ENAMETOOLONG: (void) fprintf(stderr, gettext("provided bookmark name cannot " "be used, final name would be too long\n")); break; case E2BIG: (void) fprintf(stderr, gettext("too many redaction snapshots " "specified\n")); break; case EINVAL: if (strchr(bookname, '#') != NULL) (void) fprintf(stderr, gettext( "redaction bookmark name must not contain '#'\n")); else (void) fprintf(stderr, gettext( "redaction snapshot must be descendent of " "snapshot being redacted\n")); break; case EALREADY: (void) fprintf(stderr, gettext("attempted to redact redacted " "dataset or with respect to redacted dataset\n")); break; case ENOTSUP: (void) fprintf(stderr, gettext("redaction bookmarks feature " "not enabled\n")); break; case EXDEV: (void) fprintf(stderr, gettext("potentially invalid redaction " "snapshot; full dataset names required\n")); break; default: (void) fprintf(stderr, gettext("internal error: %s\n"), strerror(errno)); } return (err); } /* * zfs rollback [-rRf] * * -r Delete any intervening snapshots before doing rollback * -R Delete any snapshots and their clones * -f ignored for backwards compatibility * * Given a filesystem, rollback to a specific snapshot, discarding any changes * since then and making it the active dataset. If more recent snapshots exist, * the command will complain unless the '-r' flag is given. */ typedef struct rollback_cbdata { uint64_t cb_create; uint8_t cb_younger_ds_printed; boolean_t cb_first; int cb_doclones; char *cb_target; int cb_error; boolean_t cb_recurse; } rollback_cbdata_t; static int rollback_check_dependent(zfs_handle_t *zhp, void *data) { rollback_cbdata_t *cbp = data; if (cbp->cb_first && cbp->cb_recurse) { (void) fprintf(stderr, gettext("cannot rollback to " "'%s': clones of previous snapshots exist\n"), cbp->cb_target); (void) fprintf(stderr, gettext("use '-R' to " "force deletion of the following clones and " "dependents:\n")); cbp->cb_first = 0; cbp->cb_error = 1; } (void) fprintf(stderr, "%s\n", zfs_get_name(zhp)); zfs_close(zhp); return (0); } /* * Report some snapshots/bookmarks more recent than the one specified. * Used when '-r' is not specified. We reuse this same callback for the * snapshot dependents - if 'cb_dependent' is set, then this is a * dependent and we should report it without checking the transaction group. */ static int rollback_check(zfs_handle_t *zhp, void *data) { rollback_cbdata_t *cbp = data; /* * Max number of younger snapshots and/or bookmarks to display before * we stop the iteration. */ const uint8_t max_younger = 32; if (cbp->cb_doclones) { zfs_close(zhp); return (0); } if (zfs_prop_get_int(zhp, ZFS_PROP_CREATETXG) > cbp->cb_create) { if (cbp->cb_first && !cbp->cb_recurse) { (void) fprintf(stderr, gettext("cannot " "rollback to '%s': more recent snapshots " "or bookmarks exist\n"), cbp->cb_target); (void) fprintf(stderr, gettext("use '-r' to " "force deletion of the following " "snapshots and bookmarks:\n")); cbp->cb_first = 0; cbp->cb_error = 1; } if (cbp->cb_recurse) { if (zfs_iter_dependents(zhp, B_TRUE, rollback_check_dependent, cbp) != 0) { zfs_close(zhp); return (-1); } } else { (void) fprintf(stderr, "%s\n", zfs_get_name(zhp)); cbp->cb_younger_ds_printed++; } } zfs_close(zhp); if (cbp->cb_younger_ds_printed == max_younger) { /* * This non-recursive rollback is going to fail due to the * presence of snapshots and/or bookmarks that are younger than * the rollback target. * We printed some of the offending objects, now we stop * zfs_iter_snapshot/bookmark iteration so we can fail fast and * avoid iterating over the rest of the younger objects */ (void) fprintf(stderr, gettext("Output limited to %d " "snapshots/bookmarks\n"), max_younger); return (-1); } return (0); } static int zfs_do_rollback(int argc, char **argv) { int ret = 0; int c; boolean_t force = B_FALSE; rollback_cbdata_t cb = { 0 }; zfs_handle_t *zhp, *snap; char parentname[ZFS_MAX_DATASET_NAME_LEN]; char *delim; uint64_t min_txg = 0; /* check options */ while ((c = getopt(argc, argv, "rRf")) != -1) { switch (c) { case 'r': cb.cb_recurse = 1; break; case 'R': cb.cb_recurse = 1; cb.cb_doclones = 1; break; case 'f': force = B_TRUE; break; case '?': (void) fprintf(stderr, gettext("invalid option '%c'\n"), optopt); usage(B_FALSE); } } argc -= optind; argv += optind; /* check number of arguments */ if (argc < 1) { (void) fprintf(stderr, gettext("missing dataset argument\n")); usage(B_FALSE); } if (argc > 1) { (void) fprintf(stderr, gettext("too many arguments\n")); usage(B_FALSE); } /* open the snapshot */ if ((snap = zfs_open(g_zfs, argv[0], ZFS_TYPE_SNAPSHOT)) == NULL) return (1); /* open the parent dataset */ (void) strlcpy(parentname, argv[0], sizeof (parentname)); verify((delim = strrchr(parentname, '@')) != NULL); *delim = '\0'; if ((zhp = zfs_open(g_zfs, parentname, ZFS_TYPE_DATASET)) == NULL) { zfs_close(snap); return (1); } /* * Check for more recent snapshots and/or clones based on the presence * of '-r' and '-R'. */ cb.cb_target = argv[0]; cb.cb_create = zfs_prop_get_int(snap, ZFS_PROP_CREATETXG); cb.cb_first = B_TRUE; cb.cb_error = 0; if (cb.cb_create > 0) min_txg = cb.cb_create; if ((ret = zfs_iter_snapshots(zhp, B_FALSE, rollback_check, &cb, min_txg, 0)) != 0) goto out; if ((ret = zfs_iter_bookmarks(zhp, rollback_check, &cb)) != 0) goto out; if ((ret = cb.cb_error) != 0) goto out; /* * Rollback parent to the given snapshot. */ ret = zfs_rollback(zhp, snap, force); out: zfs_close(snap); zfs_close(zhp); if (ret == 0) return (0); else return (1); } /* * zfs set property=value ... { fs | snap | vol } ... * * Sets the given properties for all datasets specified on the command line. */ static int set_callback(zfs_handle_t *zhp, void *data) { nvlist_t *props = data; if (zfs_prop_set_list(zhp, props) != 0) { switch (libzfs_errno(g_zfs)) { case EZFS_MOUNTFAILED: (void) fprintf(stderr, gettext("property may be set " "but unable to remount filesystem\n")); break; case EZFS_SHARENFSFAILED: (void) fprintf(stderr, gettext("property may be set " "but unable to reshare filesystem\n")); break; } return (1); } return (0); } static int zfs_do_set(int argc, char **argv) { nvlist_t *props = NULL; int ds_start = -1; /* argv idx of first dataset arg */ int ret = 0; int i; /* check for options */ if (argc > 1 && argv[1][0] == '-') { (void) fprintf(stderr, gettext("invalid option '%c'\n"), argv[1][1]); usage(B_FALSE); } /* check number of arguments */ if (argc < 2) { (void) fprintf(stderr, gettext("missing arguments\n")); usage(B_FALSE); } if (argc < 3) { if (strchr(argv[1], '=') == NULL) { (void) fprintf(stderr, gettext("missing property=value " "argument(s)\n")); } else { (void) fprintf(stderr, gettext("missing dataset " "name(s)\n")); } usage(B_FALSE); } /* validate argument order: prop=val args followed by dataset args */ for (i = 1; i < argc; i++) { if (strchr(argv[i], '=') != NULL) { if (ds_start > 0) { /* out-of-order prop=val argument */ (void) fprintf(stderr, gettext("invalid " "argument order\n")); usage(B_FALSE); } } else if (ds_start < 0) { ds_start = i; } } if (ds_start < 0) { (void) fprintf(stderr, gettext("missing dataset name(s)\n")); usage(B_FALSE); } /* Populate a list of property settings */ if (nvlist_alloc(&props, NV_UNIQUE_NAME, 0) != 0) nomem(); for (i = 1; i < ds_start; i++) { if (!parseprop(props, argv[i])) { ret = -1; goto error; } } ret = zfs_for_each(argc - ds_start, argv + ds_start, 0, ZFS_TYPE_DATASET, NULL, NULL, 0, set_callback, props); error: nvlist_free(props); return (ret); } typedef struct snap_cbdata { nvlist_t *sd_nvl; boolean_t sd_recursive; const char *sd_snapname; } snap_cbdata_t; static int zfs_snapshot_cb(zfs_handle_t *zhp, void *arg) { snap_cbdata_t *sd = arg; char *name; int rv = 0; int error; if (sd->sd_recursive && zfs_prop_get_int(zhp, ZFS_PROP_INCONSISTENT) != 0) { zfs_close(zhp); return (0); } error = asprintf(&name, "%s@%s", zfs_get_name(zhp), sd->sd_snapname); if (error == -1) nomem(); fnvlist_add_boolean(sd->sd_nvl, name); free(name); if (sd->sd_recursive) rv = zfs_iter_filesystems(zhp, zfs_snapshot_cb, sd); zfs_close(zhp); return (rv); } /* * zfs snapshot [-r] [-o prop=value] ... * * Creates a snapshot with the given name. While functionally equivalent to * 'zfs create', it is a separate command to differentiate intent. */ static int zfs_do_snapshot(int argc, char **argv) { int ret = 0; int c; nvlist_t *props; snap_cbdata_t sd = { 0 }; boolean_t multiple_snaps = B_FALSE; if (nvlist_alloc(&props, NV_UNIQUE_NAME, 0) != 0) nomem(); if (nvlist_alloc(&sd.sd_nvl, NV_UNIQUE_NAME, 0) != 0) nomem(); /* check options */ while ((c = getopt(argc, argv, "ro:")) != -1) { switch (c) { case 'o': if (!parseprop(props, optarg)) { nvlist_free(sd.sd_nvl); nvlist_free(props); return (1); } break; case 'r': sd.sd_recursive = B_TRUE; multiple_snaps = B_TRUE; break; case '?': (void) fprintf(stderr, gettext("invalid option '%c'\n"), optopt); goto usage; } } argc -= optind; argv += optind; /* check number of arguments */ if (argc < 1) { (void) fprintf(stderr, gettext("missing snapshot argument\n")); goto usage; } if (argc > 1) multiple_snaps = B_TRUE; for (; argc > 0; argc--, argv++) { char *atp; zfs_handle_t *zhp; atp = strchr(argv[0], '@'); if (atp == NULL) goto usage; *atp = '\0'; sd.sd_snapname = atp + 1; zhp = zfs_open(g_zfs, argv[0], ZFS_TYPE_FILESYSTEM | ZFS_TYPE_VOLUME); if (zhp == NULL) goto usage; if (zfs_snapshot_cb(zhp, &sd) != 0) goto usage; } ret = zfs_snapshot_nvl(g_zfs, sd.sd_nvl, props); nvlist_free(sd.sd_nvl); nvlist_free(props); if (ret != 0 && multiple_snaps) (void) fprintf(stderr, gettext("no snapshots were created\n")); return (ret != 0); usage: nvlist_free(sd.sd_nvl); nvlist_free(props); usage(B_FALSE); return (-1); } /* * Array of prefixes to exclude – * a linear search, even if executed for each dataset, * is plenty good enough. */ typedef struct zfs_send_exclude_arg { size_t count; const char **list; } zfs_send_exclude_arg_t; static boolean_t zfs_do_send_exclude(zfs_handle_t *zhp, void *context) { zfs_send_exclude_arg_t *excludes = context; const char *name = zfs_get_name(zhp); for (size_t i = 0; i < excludes->count; ++i) { size_t len = strlen(excludes->list[i]); if (strncmp(name, excludes->list[i], len) == 0 && memchr("/@", name[len], sizeof ("/@"))) return (B_FALSE); } return (B_TRUE); } /* * Send a backup stream to stdout. */ static int zfs_do_send(int argc, char **argv) { char *fromname = NULL; char *toname = NULL; char *resume_token = NULL; char *cp; zfs_handle_t *zhp; sendflags_t flags = { 0 }; int c, err; nvlist_t *dbgnv = NULL; char *redactbook = NULL; zfs_send_exclude_arg_t excludes = { 0 }; struct option long_options[] = { {"replicate", no_argument, NULL, 'R'}, {"skip-missing", no_argument, NULL, 's'}, {"redact", required_argument, NULL, 'd'}, {"props", no_argument, NULL, 'p'}, {"parsable", no_argument, NULL, 'P'}, {"dedup", no_argument, NULL, 'D'}, {"verbose", no_argument, NULL, 'v'}, {"dryrun", no_argument, NULL, 'n'}, {"large-block", no_argument, NULL, 'L'}, {"embed", no_argument, NULL, 'e'}, {"resume", required_argument, NULL, 't'}, {"compressed", no_argument, NULL, 'c'}, {"raw", no_argument, NULL, 'w'}, {"backup", no_argument, NULL, 'b'}, {"holds", no_argument, NULL, 'h'}, {"saved", no_argument, NULL, 'S'}, {"exclude", required_argument, NULL, 'X'}, {0, 0, 0, 0} }; /* check options */ while ((c = getopt_long(argc, argv, ":i:I:RsDpvnPLeht:cwbd:SX:", long_options, NULL)) != -1) { switch (c) { case 'X': for (char *ds; (ds = strsep(&optarg, ",")) != NULL; ) { if (!zfs_name_valid(ds, ZFS_TYPE_DATASET) || strchr(ds, '/') == NULL) { (void) fprintf(stderr, gettext("-X %s: " "not a valid non-root dataset name" ".\n"), ds); usage(B_FALSE); } excludes.list = safe_realloc(excludes.list, sizeof (char *) * (excludes.count + 1)); excludes.list[excludes.count++] = ds; } break; case 'i': if (fromname) usage(B_FALSE); fromname = optarg; break; case 'I': if (fromname) usage(B_FALSE); fromname = optarg; flags.doall = B_TRUE; break; case 'R': flags.replicate = B_TRUE; break; case 's': flags.skipmissing = B_TRUE; break; case 'd': redactbook = optarg; break; case 'p': flags.props = B_TRUE; break; case 'b': flags.backup = B_TRUE; break; case 'h': flags.holds = B_TRUE; break; case 'P': flags.parsable = B_TRUE; break; case 'v': flags.verbosity++; flags.progress = B_TRUE; break; case 'D': (void) fprintf(stderr, gettext("WARNING: deduplicated send is no " "longer supported. A regular,\n" "non-deduplicated stream will be generated.\n\n")); break; case 'n': flags.dryrun = B_TRUE; break; case 'L': flags.largeblock = B_TRUE; break; case 'e': flags.embed_data = B_TRUE; break; case 't': resume_token = optarg; break; case 'c': flags.compress = B_TRUE; break; case 'w': flags.raw = B_TRUE; flags.compress = B_TRUE; flags.embed_data = B_TRUE; flags.largeblock = B_TRUE; break; case 'S': flags.saved = B_TRUE; break; case ':': /* * If a parameter was not passed, optopt contains the * value that would normally lead us into the * appropriate case statement. If it's > 256, then this * must be a longopt and we should look at argv to get * the string. Otherwise it's just the character, so we * should use it directly. */ if (optopt <= UINT8_MAX) { (void) fprintf(stderr, gettext("missing argument for '%c' " "option\n"), optopt); } else { (void) fprintf(stderr, gettext("missing argument for '%s' " "option\n"), argv[optind - 1]); } usage(B_FALSE); break; case '?': default: /* * If an invalid flag was passed, optopt contains the * character if it was a short flag, or 0 if it was a * longopt. */ if (optopt != 0) { (void) fprintf(stderr, gettext("invalid option '%c'\n"), optopt); } else { (void) fprintf(stderr, gettext("invalid option '%s'\n"), argv[optind - 1]); } usage(B_FALSE); } } if (flags.parsable && flags.verbosity == 0) flags.verbosity = 1; if (excludes.count > 0 && !flags.replicate) { (void) fprintf(stderr, gettext("Cannot specify " "dataset exclusion (-X) on a non-recursive " "send.\n")); return (1); } argc -= optind; argv += optind; if (resume_token != NULL) { if (fromname != NULL || flags.replicate || flags.props || flags.backup || flags.holds || flags.saved || redactbook != NULL) { (void) fprintf(stderr, gettext("invalid flags combined with -t\n")); usage(B_FALSE); } if (argc > 0) { (void) fprintf(stderr, gettext("too many arguments\n")); usage(B_FALSE); } } else { if (argc < 1) { (void) fprintf(stderr, gettext("missing snapshot argument\n")); usage(B_FALSE); } if (argc > 1) { (void) fprintf(stderr, gettext("too many arguments\n")); usage(B_FALSE); } } if (flags.saved) { if (fromname != NULL || flags.replicate || flags.props || flags.doall || flags.backup || flags.holds || flags.largeblock || flags.embed_data || flags.compress || flags.raw || redactbook != NULL) { (void) fprintf(stderr, gettext("incompatible flags " "combined with saved send flag\n")); usage(B_FALSE); } if (strchr(argv[0], '@') != NULL) { (void) fprintf(stderr, gettext("saved send must " "specify the dataset with partially-received " "state\n")); usage(B_FALSE); } } if (flags.raw && redactbook != NULL) { (void) fprintf(stderr, gettext("Error: raw sends may not be redacted.\n")); return (1); } if (!flags.dryrun && isatty(STDOUT_FILENO)) { (void) fprintf(stderr, gettext("Error: Stream can not be written to a terminal.\n" "You must redirect standard output.\n")); return (1); } if (flags.saved) { zhp = zfs_open(g_zfs, argv[0], ZFS_TYPE_DATASET); if (zhp == NULL) return (1); err = zfs_send_saved(zhp, &flags, STDOUT_FILENO, resume_token); zfs_close(zhp); return (err != 0); } else if (resume_token != NULL) { return (zfs_send_resume(g_zfs, &flags, STDOUT_FILENO, resume_token)); } if (flags.skipmissing && !flags.replicate) { (void) fprintf(stderr, gettext("skip-missing flag can only be used in " "conjunction with replicate\n")); usage(B_FALSE); } /* * For everything except -R and -I, use the new, cleaner code path. */ if (!(flags.replicate || flags.doall)) { char frombuf[ZFS_MAX_DATASET_NAME_LEN]; if (fromname != NULL && (strchr(fromname, '#') == NULL && strchr(fromname, '@') == NULL)) { /* * Neither bookmark or snapshot was specified. Print a * warning, and assume snapshot. */ (void) fprintf(stderr, "Warning: incremental source " "didn't specify type, assuming snapshot. Use '@' " "or '#' prefix to avoid ambiguity.\n"); (void) snprintf(frombuf, sizeof (frombuf), "@%s", fromname); fromname = frombuf; } if (fromname != NULL && (fromname[0] == '#' || fromname[0] == '@')) { /* * Incremental source name begins with # or @. * Default to same fs as target. */ char tmpbuf[ZFS_MAX_DATASET_NAME_LEN]; (void) strlcpy(tmpbuf, fromname, sizeof (tmpbuf)); (void) strlcpy(frombuf, argv[0], sizeof (frombuf)); cp = strchr(frombuf, '@'); if (cp != NULL) *cp = '\0'; (void) strlcat(frombuf, tmpbuf, sizeof (frombuf)); fromname = frombuf; } zhp = zfs_open(g_zfs, argv[0], ZFS_TYPE_DATASET); if (zhp == NULL) return (1); err = zfs_send_one(zhp, fromname, STDOUT_FILENO, &flags, redactbook); zfs_close(zhp); return (err != 0); } if (fromname != NULL && strchr(fromname, '#')) { (void) fprintf(stderr, gettext("Error: multiple snapshots cannot be " "sent from a bookmark.\n")); return (1); } if (redactbook != NULL) { (void) fprintf(stderr, gettext("Error: multiple snapshots " "cannot be sent redacted.\n")); return (1); } if ((cp = strchr(argv[0], '@')) == NULL) { (void) fprintf(stderr, gettext("Error: " "Unsupported flag with filesystem or bookmark.\n")); return (1); } *cp = '\0'; toname = cp + 1; zhp = zfs_open(g_zfs, argv[0], ZFS_TYPE_FILESYSTEM | ZFS_TYPE_VOLUME); if (zhp == NULL) return (1); /* * If they specified the full path to the snapshot, chop off * everything except the short name of the snapshot, but special * case if they specify the origin. */ if (fromname && (cp = strchr(fromname, '@')) != NULL) { char origin[ZFS_MAX_DATASET_NAME_LEN]; zprop_source_t src; (void) zfs_prop_get(zhp, ZFS_PROP_ORIGIN, origin, sizeof (origin), &src, NULL, 0, B_FALSE); if (strcmp(origin, fromname) == 0) { fromname = NULL; flags.fromorigin = B_TRUE; } else { *cp = '\0'; if (cp != fromname && strcmp(argv[0], fromname)) { (void) fprintf(stderr, gettext("incremental source must be " "in same filesystem\n")); usage(B_FALSE); } fromname = cp + 1; if (strchr(fromname, '@') || strchr(fromname, '/')) { (void) fprintf(stderr, gettext("invalid incremental source\n")); usage(B_FALSE); } } } if (flags.replicate && fromname == NULL) flags.doall = B_TRUE; err = zfs_send(zhp, fromname, toname, &flags, STDOUT_FILENO, excludes.count > 0 ? zfs_do_send_exclude : NULL, &excludes, flags.verbosity >= 3 ? &dbgnv : NULL); if (flags.verbosity >= 3 && dbgnv != NULL) { /* * dump_nvlist prints to stdout, but that's been * redirected to a file. Make it print to stderr * instead. */ (void) dup2(STDERR_FILENO, STDOUT_FILENO); dump_nvlist(dbgnv, 0); nvlist_free(dbgnv); } zfs_close(zhp); free(excludes.list); return (err != 0); } /* * Restore a backup stream from stdin. */ static int zfs_do_receive(int argc, char **argv) { int c, err = 0; recvflags_t flags = { 0 }; boolean_t abort_resumable = B_FALSE; nvlist_t *props; if (nvlist_alloc(&props, NV_UNIQUE_NAME, 0) != 0) nomem(); /* check options */ while ((c = getopt(argc, argv, ":o:x:dehMnuvFsAc")) != -1) { switch (c) { case 'o': if (!parseprop(props, optarg)) { nvlist_free(props); usage(B_FALSE); } break; case 'x': if (!parsepropname(props, optarg)) { nvlist_free(props); usage(B_FALSE); } break; case 'd': if (flags.istail) { (void) fprintf(stderr, gettext("invalid option " "combination: -d and -e are mutually " "exclusive\n")); usage(B_FALSE); } flags.isprefix = B_TRUE; break; case 'e': if (flags.isprefix) { (void) fprintf(stderr, gettext("invalid option " "combination: -d and -e are mutually " "exclusive\n")); usage(B_FALSE); } flags.istail = B_TRUE; break; case 'h': flags.skipholds = B_TRUE; break; case 'M': flags.forceunmount = B_TRUE; break; case 'n': flags.dryrun = B_TRUE; break; case 'u': flags.nomount = B_TRUE; break; case 'v': flags.verbose = B_TRUE; break; case 's': flags.resumable = B_TRUE; break; case 'F': flags.force = B_TRUE; break; case 'A': abort_resumable = B_TRUE; break; case 'c': flags.heal = B_TRUE; break; case ':': (void) fprintf(stderr, gettext("missing argument for " "'%c' option\n"), optopt); usage(B_FALSE); break; case '?': (void) fprintf(stderr, gettext("invalid option '%c'\n"), optopt); usage(B_FALSE); } } argc -= optind; argv += optind; /* zfs recv -e (use "tail" name) implies -d (remove dataset "head") */ if (flags.istail) flags.isprefix = B_TRUE; /* check number of arguments */ if (argc < 1) { (void) fprintf(stderr, gettext("missing snapshot argument\n")); usage(B_FALSE); } if (argc > 1) { (void) fprintf(stderr, gettext("too many arguments\n")); usage(B_FALSE); } if (abort_resumable) { if (flags.isprefix || flags.istail || flags.dryrun || flags.resumable || flags.nomount) { (void) fprintf(stderr, gettext("invalid option\n")); usage(B_FALSE); } char namebuf[ZFS_MAX_DATASET_NAME_LEN]; (void) snprintf(namebuf, sizeof (namebuf), "%s/%%recv", argv[0]); if (zfs_dataset_exists(g_zfs, namebuf, ZFS_TYPE_FILESYSTEM | ZFS_TYPE_VOLUME)) { zfs_handle_t *zhp = zfs_open(g_zfs, namebuf, ZFS_TYPE_FILESYSTEM | ZFS_TYPE_VOLUME); if (zhp == NULL) { nvlist_free(props); return (1); } err = zfs_destroy(zhp, B_FALSE); zfs_close(zhp); } else { zfs_handle_t *zhp = zfs_open(g_zfs, argv[0], ZFS_TYPE_FILESYSTEM | ZFS_TYPE_VOLUME); if (zhp == NULL) usage(B_FALSE); if (!zfs_prop_get_int(zhp, ZFS_PROP_INCONSISTENT) || zfs_prop_get(zhp, ZFS_PROP_RECEIVE_RESUME_TOKEN, NULL, 0, NULL, NULL, 0, B_TRUE) == -1) { (void) fprintf(stderr, gettext("'%s' does not have any " "resumable receive state to abort\n"), argv[0]); nvlist_free(props); zfs_close(zhp); return (1); } err = zfs_destroy(zhp, B_FALSE); zfs_close(zhp); } nvlist_free(props); return (err != 0); } if (isatty(STDIN_FILENO)) { (void) fprintf(stderr, gettext("Error: Backup stream can not be read " "from a terminal.\n" "You must redirect standard input.\n")); nvlist_free(props); return (1); } err = zfs_receive(g_zfs, argv[0], props, &flags, STDIN_FILENO, NULL); nvlist_free(props); return (err != 0); } /* * allow/unallow stuff */ /* copied from zfs/sys/dsl_deleg.h */ #define ZFS_DELEG_PERM_CREATE "create" #define ZFS_DELEG_PERM_DESTROY "destroy" #define ZFS_DELEG_PERM_SNAPSHOT "snapshot" #define ZFS_DELEG_PERM_ROLLBACK "rollback" #define ZFS_DELEG_PERM_CLONE "clone" #define ZFS_DELEG_PERM_PROMOTE "promote" #define ZFS_DELEG_PERM_RENAME "rename" #define ZFS_DELEG_PERM_MOUNT "mount" #define ZFS_DELEG_PERM_SHARE "share" #define ZFS_DELEG_PERM_SEND "send" #define ZFS_DELEG_PERM_RECEIVE "receive" #define ZFS_DELEG_PERM_ALLOW "allow" #define ZFS_DELEG_PERM_USERPROP "userprop" #define ZFS_DELEG_PERM_VSCAN "vscan" /* ??? */ #define ZFS_DELEG_PERM_USERQUOTA "userquota" #define ZFS_DELEG_PERM_GROUPQUOTA "groupquota" #define ZFS_DELEG_PERM_USERUSED "userused" #define ZFS_DELEG_PERM_GROUPUSED "groupused" #define ZFS_DELEG_PERM_USEROBJQUOTA "userobjquota" #define ZFS_DELEG_PERM_GROUPOBJQUOTA "groupobjquota" #define ZFS_DELEG_PERM_USEROBJUSED "userobjused" #define ZFS_DELEG_PERM_GROUPOBJUSED "groupobjused" #define ZFS_DELEG_PERM_HOLD "hold" #define ZFS_DELEG_PERM_RELEASE "release" #define ZFS_DELEG_PERM_DIFF "diff" #define ZFS_DELEG_PERM_BOOKMARK "bookmark" #define ZFS_DELEG_PERM_LOAD_KEY "load-key" #define ZFS_DELEG_PERM_CHANGE_KEY "change-key" #define ZFS_DELEG_PERM_PROJECTUSED "projectused" #define ZFS_DELEG_PERM_PROJECTQUOTA "projectquota" #define ZFS_DELEG_PERM_PROJECTOBJUSED "projectobjused" #define ZFS_DELEG_PERM_PROJECTOBJQUOTA "projectobjquota" #define ZFS_NUM_DELEG_NOTES ZFS_DELEG_NOTE_NONE static zfs_deleg_perm_tab_t zfs_deleg_perm_tbl[] = { { ZFS_DELEG_PERM_ALLOW, ZFS_DELEG_NOTE_ALLOW }, { ZFS_DELEG_PERM_CLONE, ZFS_DELEG_NOTE_CLONE }, { ZFS_DELEG_PERM_CREATE, ZFS_DELEG_NOTE_CREATE }, { ZFS_DELEG_PERM_DESTROY, ZFS_DELEG_NOTE_DESTROY }, { ZFS_DELEG_PERM_DIFF, ZFS_DELEG_NOTE_DIFF}, { ZFS_DELEG_PERM_HOLD, ZFS_DELEG_NOTE_HOLD }, { ZFS_DELEG_PERM_MOUNT, ZFS_DELEG_NOTE_MOUNT }, { ZFS_DELEG_PERM_PROMOTE, ZFS_DELEG_NOTE_PROMOTE }, { ZFS_DELEG_PERM_RECEIVE, ZFS_DELEG_NOTE_RECEIVE }, { ZFS_DELEG_PERM_RELEASE, ZFS_DELEG_NOTE_RELEASE }, { ZFS_DELEG_PERM_RENAME, ZFS_DELEG_NOTE_RENAME }, { ZFS_DELEG_PERM_ROLLBACK, ZFS_DELEG_NOTE_ROLLBACK }, { ZFS_DELEG_PERM_SEND, ZFS_DELEG_NOTE_SEND }, { ZFS_DELEG_PERM_SHARE, ZFS_DELEG_NOTE_SHARE }, { ZFS_DELEG_PERM_SNAPSHOT, ZFS_DELEG_NOTE_SNAPSHOT }, { ZFS_DELEG_PERM_BOOKMARK, ZFS_DELEG_NOTE_BOOKMARK }, { ZFS_DELEG_PERM_LOAD_KEY, ZFS_DELEG_NOTE_LOAD_KEY }, { ZFS_DELEG_PERM_CHANGE_KEY, ZFS_DELEG_NOTE_CHANGE_KEY }, { ZFS_DELEG_PERM_GROUPQUOTA, ZFS_DELEG_NOTE_GROUPQUOTA }, { ZFS_DELEG_PERM_GROUPUSED, ZFS_DELEG_NOTE_GROUPUSED }, { ZFS_DELEG_PERM_USERPROP, ZFS_DELEG_NOTE_USERPROP }, { ZFS_DELEG_PERM_USERQUOTA, ZFS_DELEG_NOTE_USERQUOTA }, { ZFS_DELEG_PERM_USERUSED, ZFS_DELEG_NOTE_USERUSED }, { ZFS_DELEG_PERM_USEROBJQUOTA, ZFS_DELEG_NOTE_USEROBJQUOTA }, { ZFS_DELEG_PERM_USEROBJUSED, ZFS_DELEG_NOTE_USEROBJUSED }, { ZFS_DELEG_PERM_GROUPOBJQUOTA, ZFS_DELEG_NOTE_GROUPOBJQUOTA }, { ZFS_DELEG_PERM_GROUPOBJUSED, ZFS_DELEG_NOTE_GROUPOBJUSED }, { ZFS_DELEG_PERM_PROJECTUSED, ZFS_DELEG_NOTE_PROJECTUSED }, { ZFS_DELEG_PERM_PROJECTQUOTA, ZFS_DELEG_NOTE_PROJECTQUOTA }, { ZFS_DELEG_PERM_PROJECTOBJUSED, ZFS_DELEG_NOTE_PROJECTOBJUSED }, { ZFS_DELEG_PERM_PROJECTOBJQUOTA, ZFS_DELEG_NOTE_PROJECTOBJQUOTA }, { NULL, ZFS_DELEG_NOTE_NONE } }; /* permission structure */ typedef struct deleg_perm { zfs_deleg_who_type_t dp_who_type; const char *dp_name; boolean_t dp_local; boolean_t dp_descend; } deleg_perm_t; /* */ typedef struct deleg_perm_node { deleg_perm_t dpn_perm; uu_avl_node_t dpn_avl_node; } deleg_perm_node_t; typedef struct fs_perm fs_perm_t; /* permissions set */ typedef struct who_perm { zfs_deleg_who_type_t who_type; const char *who_name; /* id */ char who_ug_name[256]; /* user/group name */ fs_perm_t *who_fsperm; /* uplink */ uu_avl_t *who_deleg_perm_avl; /* permissions */ } who_perm_t; /* */ typedef struct who_perm_node { who_perm_t who_perm; uu_avl_node_t who_avl_node; } who_perm_node_t; typedef struct fs_perm_set fs_perm_set_t; /* fs permissions */ struct fs_perm { const char *fsp_name; uu_avl_t *fsp_sc_avl; /* sets,create */ uu_avl_t *fsp_uge_avl; /* user,group,everyone */ fs_perm_set_t *fsp_set; /* uplink */ }; /* */ typedef struct fs_perm_node { fs_perm_t fspn_fsperm; uu_avl_t *fspn_avl; uu_list_node_t fspn_list_node; } fs_perm_node_t; /* top level structure */ struct fs_perm_set { uu_list_pool_t *fsps_list_pool; uu_list_t *fsps_list; /* list of fs_perms */ uu_avl_pool_t *fsps_named_set_avl_pool; uu_avl_pool_t *fsps_who_perm_avl_pool; uu_avl_pool_t *fsps_deleg_perm_avl_pool; }; static inline const char * deleg_perm_type(zfs_deleg_note_t note) { /* subcommands */ switch (note) { /* SUBCOMMANDS */ /* OTHER */ case ZFS_DELEG_NOTE_GROUPQUOTA: case ZFS_DELEG_NOTE_GROUPUSED: case ZFS_DELEG_NOTE_USERPROP: case ZFS_DELEG_NOTE_USERQUOTA: case ZFS_DELEG_NOTE_USERUSED: case ZFS_DELEG_NOTE_USEROBJQUOTA: case ZFS_DELEG_NOTE_USEROBJUSED: case ZFS_DELEG_NOTE_GROUPOBJQUOTA: case ZFS_DELEG_NOTE_GROUPOBJUSED: case ZFS_DELEG_NOTE_PROJECTUSED: case ZFS_DELEG_NOTE_PROJECTQUOTA: case ZFS_DELEG_NOTE_PROJECTOBJUSED: case ZFS_DELEG_NOTE_PROJECTOBJQUOTA: /* other */ return (gettext("other")); default: return (gettext("subcommand")); } } static int who_type2weight(zfs_deleg_who_type_t who_type) { int res; switch (who_type) { case ZFS_DELEG_NAMED_SET_SETS: case ZFS_DELEG_NAMED_SET: res = 0; break; case ZFS_DELEG_CREATE_SETS: case ZFS_DELEG_CREATE: res = 1; break; case ZFS_DELEG_USER_SETS: case ZFS_DELEG_USER: res = 2; break; case ZFS_DELEG_GROUP_SETS: case ZFS_DELEG_GROUP: res = 3; break; case ZFS_DELEG_EVERYONE_SETS: case ZFS_DELEG_EVERYONE: res = 4; break; default: res = -1; } return (res); } static int who_perm_compare(const void *larg, const void *rarg, void *unused) { (void) unused; const who_perm_node_t *l = larg; const who_perm_node_t *r = rarg; zfs_deleg_who_type_t ltype = l->who_perm.who_type; zfs_deleg_who_type_t rtype = r->who_perm.who_type; int lweight = who_type2weight(ltype); int rweight = who_type2weight(rtype); int res = lweight - rweight; if (res == 0) res = strncmp(l->who_perm.who_name, r->who_perm.who_name, ZFS_MAX_DELEG_NAME-1); if (res == 0) return (0); if (res > 0) return (1); else return (-1); } static int deleg_perm_compare(const void *larg, const void *rarg, void *unused) { (void) unused; const deleg_perm_node_t *l = larg; const deleg_perm_node_t *r = rarg; int res = strncmp(l->dpn_perm.dp_name, r->dpn_perm.dp_name, ZFS_MAX_DELEG_NAME-1); if (res == 0) return (0); if (res > 0) return (1); else return (-1); } static inline void fs_perm_set_init(fs_perm_set_t *fspset) { memset(fspset, 0, sizeof (fs_perm_set_t)); if ((fspset->fsps_list_pool = uu_list_pool_create("fsps_list_pool", sizeof (fs_perm_node_t), offsetof(fs_perm_node_t, fspn_list_node), NULL, UU_DEFAULT)) == NULL) nomem(); if ((fspset->fsps_list = uu_list_create(fspset->fsps_list_pool, NULL, UU_DEFAULT)) == NULL) nomem(); if ((fspset->fsps_named_set_avl_pool = uu_avl_pool_create( "named_set_avl_pool", sizeof (who_perm_node_t), offsetof( who_perm_node_t, who_avl_node), who_perm_compare, UU_DEFAULT)) == NULL) nomem(); if ((fspset->fsps_who_perm_avl_pool = uu_avl_pool_create( "who_perm_avl_pool", sizeof (who_perm_node_t), offsetof( who_perm_node_t, who_avl_node), who_perm_compare, UU_DEFAULT)) == NULL) nomem(); if ((fspset->fsps_deleg_perm_avl_pool = uu_avl_pool_create( "deleg_perm_avl_pool", sizeof (deleg_perm_node_t), offsetof( deleg_perm_node_t, dpn_avl_node), deleg_perm_compare, UU_DEFAULT)) == NULL) nomem(); } static inline void fs_perm_fini(fs_perm_t *); static inline void who_perm_fini(who_perm_t *); static inline void fs_perm_set_fini(fs_perm_set_t *fspset) { fs_perm_node_t *node = uu_list_first(fspset->fsps_list); while (node != NULL) { fs_perm_node_t *next_node = uu_list_next(fspset->fsps_list, node); fs_perm_t *fsperm = &node->fspn_fsperm; fs_perm_fini(fsperm); uu_list_remove(fspset->fsps_list, node); free(node); node = next_node; } uu_avl_pool_destroy(fspset->fsps_named_set_avl_pool); uu_avl_pool_destroy(fspset->fsps_who_perm_avl_pool); uu_avl_pool_destroy(fspset->fsps_deleg_perm_avl_pool); } static inline void deleg_perm_init(deleg_perm_t *deleg_perm, zfs_deleg_who_type_t type, const char *name) { deleg_perm->dp_who_type = type; deleg_perm->dp_name = name; } static inline void who_perm_init(who_perm_t *who_perm, fs_perm_t *fsperm, zfs_deleg_who_type_t type, const char *name) { uu_avl_pool_t *pool; pool = fsperm->fsp_set->fsps_deleg_perm_avl_pool; memset(who_perm, 0, sizeof (who_perm_t)); if ((who_perm->who_deleg_perm_avl = uu_avl_create(pool, NULL, UU_DEFAULT)) == NULL) nomem(); who_perm->who_type = type; who_perm->who_name = name; who_perm->who_fsperm = fsperm; } static inline void who_perm_fini(who_perm_t *who_perm) { deleg_perm_node_t *node = uu_avl_first(who_perm->who_deleg_perm_avl); while (node != NULL) { deleg_perm_node_t *next_node = uu_avl_next(who_perm->who_deleg_perm_avl, node); uu_avl_remove(who_perm->who_deleg_perm_avl, node); free(node); node = next_node; } uu_avl_destroy(who_perm->who_deleg_perm_avl); } static inline void fs_perm_init(fs_perm_t *fsperm, fs_perm_set_t *fspset, const char *fsname) { uu_avl_pool_t *nset_pool = fspset->fsps_named_set_avl_pool; uu_avl_pool_t *who_pool = fspset->fsps_who_perm_avl_pool; memset(fsperm, 0, sizeof (fs_perm_t)); if ((fsperm->fsp_sc_avl = uu_avl_create(nset_pool, NULL, UU_DEFAULT)) == NULL) nomem(); if ((fsperm->fsp_uge_avl = uu_avl_create(who_pool, NULL, UU_DEFAULT)) == NULL) nomem(); fsperm->fsp_set = fspset; fsperm->fsp_name = fsname; } static inline void fs_perm_fini(fs_perm_t *fsperm) { who_perm_node_t *node = uu_avl_first(fsperm->fsp_sc_avl); while (node != NULL) { who_perm_node_t *next_node = uu_avl_next(fsperm->fsp_sc_avl, node); who_perm_t *who_perm = &node->who_perm; who_perm_fini(who_perm); uu_avl_remove(fsperm->fsp_sc_avl, node); free(node); node = next_node; } node = uu_avl_first(fsperm->fsp_uge_avl); while (node != NULL) { who_perm_node_t *next_node = uu_avl_next(fsperm->fsp_uge_avl, node); who_perm_t *who_perm = &node->who_perm; who_perm_fini(who_perm); uu_avl_remove(fsperm->fsp_uge_avl, node); free(node); node = next_node; } uu_avl_destroy(fsperm->fsp_sc_avl); uu_avl_destroy(fsperm->fsp_uge_avl); } static void set_deleg_perm_node(uu_avl_t *avl, deleg_perm_node_t *node, zfs_deleg_who_type_t who_type, const char *name, char locality) { uu_avl_index_t idx = 0; deleg_perm_node_t *found_node = NULL; deleg_perm_t *deleg_perm = &node->dpn_perm; deleg_perm_init(deleg_perm, who_type, name); if ((found_node = uu_avl_find(avl, node, NULL, &idx)) == NULL) uu_avl_insert(avl, node, idx); else { node = found_node; deleg_perm = &node->dpn_perm; } switch (locality) { case ZFS_DELEG_LOCAL: deleg_perm->dp_local = B_TRUE; break; case ZFS_DELEG_DESCENDENT: deleg_perm->dp_descend = B_TRUE; break; case ZFS_DELEG_NA: break; default: assert(B_FALSE); /* invalid locality */ } } static inline int parse_who_perm(who_perm_t *who_perm, nvlist_t *nvl, char locality) { nvpair_t *nvp = NULL; fs_perm_set_t *fspset = who_perm->who_fsperm->fsp_set; uu_avl_t *avl = who_perm->who_deleg_perm_avl; zfs_deleg_who_type_t who_type = who_perm->who_type; while ((nvp = nvlist_next_nvpair(nvl, nvp)) != NULL) { const char *name = nvpair_name(nvp); data_type_t type = nvpair_type(nvp); uu_avl_pool_t *avl_pool = fspset->fsps_deleg_perm_avl_pool; deleg_perm_node_t *node = safe_malloc(sizeof (deleg_perm_node_t)); VERIFY(type == DATA_TYPE_BOOLEAN); uu_avl_node_init(node, &node->dpn_avl_node, avl_pool); set_deleg_perm_node(avl, node, who_type, name, locality); } return (0); } static inline int parse_fs_perm(fs_perm_t *fsperm, nvlist_t *nvl) { nvpair_t *nvp = NULL; fs_perm_set_t *fspset = fsperm->fsp_set; while ((nvp = nvlist_next_nvpair(nvl, nvp)) != NULL) { nvlist_t *nvl2 = NULL; const char *name = nvpair_name(nvp); uu_avl_t *avl = NULL; uu_avl_pool_t *avl_pool = NULL; zfs_deleg_who_type_t perm_type = name[0]; char perm_locality = name[1]; const char *perm_name = name + 3; who_perm_t *who_perm = NULL; assert('$' == name[2]); if (nvpair_value_nvlist(nvp, &nvl2) != 0) return (-1); switch (perm_type) { case ZFS_DELEG_CREATE: case ZFS_DELEG_CREATE_SETS: case ZFS_DELEG_NAMED_SET: case ZFS_DELEG_NAMED_SET_SETS: avl_pool = fspset->fsps_named_set_avl_pool; avl = fsperm->fsp_sc_avl; break; case ZFS_DELEG_USER: case ZFS_DELEG_USER_SETS: case ZFS_DELEG_GROUP: case ZFS_DELEG_GROUP_SETS: case ZFS_DELEG_EVERYONE: case ZFS_DELEG_EVERYONE_SETS: avl_pool = fspset->fsps_who_perm_avl_pool; avl = fsperm->fsp_uge_avl; break; default: assert(!"unhandled zfs_deleg_who_type_t"); } who_perm_node_t *found_node = NULL; who_perm_node_t *node = safe_malloc( sizeof (who_perm_node_t)); who_perm = &node->who_perm; uu_avl_index_t idx = 0; uu_avl_node_init(node, &node->who_avl_node, avl_pool); who_perm_init(who_perm, fsperm, perm_type, perm_name); if ((found_node = uu_avl_find(avl, node, NULL, &idx)) == NULL) { if (avl == fsperm->fsp_uge_avl) { uid_t rid = 0; struct passwd *p = NULL; struct group *g = NULL; const char *nice_name = NULL; switch (perm_type) { case ZFS_DELEG_USER_SETS: case ZFS_DELEG_USER: rid = atoi(perm_name); p = getpwuid(rid); if (p) nice_name = p->pw_name; break; case ZFS_DELEG_GROUP_SETS: case ZFS_DELEG_GROUP: rid = atoi(perm_name); g = getgrgid(rid); if (g) nice_name = g->gr_name; break; default: break; } if (nice_name != NULL) { (void) strlcpy( node->who_perm.who_ug_name, nice_name, 256); } else { /* User or group unknown */ (void) snprintf( node->who_perm.who_ug_name, sizeof (node->who_perm.who_ug_name), "(unknown: %d)", rid); } } uu_avl_insert(avl, node, idx); } else { node = found_node; who_perm = &node->who_perm; } assert(who_perm != NULL); (void) parse_who_perm(who_perm, nvl2, perm_locality); } return (0); } static inline int parse_fs_perm_set(fs_perm_set_t *fspset, nvlist_t *nvl) { nvpair_t *nvp = NULL; uu_avl_index_t idx = 0; while ((nvp = nvlist_next_nvpair(nvl, nvp)) != NULL) { nvlist_t *nvl2 = NULL; const char *fsname = nvpair_name(nvp); data_type_t type = nvpair_type(nvp); fs_perm_t *fsperm = NULL; fs_perm_node_t *node = safe_malloc(sizeof (fs_perm_node_t)); fsperm = &node->fspn_fsperm; VERIFY(DATA_TYPE_NVLIST == type); uu_list_node_init(node, &node->fspn_list_node, fspset->fsps_list_pool); idx = uu_list_numnodes(fspset->fsps_list); fs_perm_init(fsperm, fspset, fsname); if (nvpair_value_nvlist(nvp, &nvl2) != 0) return (-1); (void) parse_fs_perm(fsperm, nvl2); uu_list_insert(fspset->fsps_list, node, idx); } return (0); } static inline const char * deleg_perm_comment(zfs_deleg_note_t note) { const char *str = ""; /* subcommands */ switch (note) { /* SUBCOMMANDS */ case ZFS_DELEG_NOTE_ALLOW: str = gettext("Must also have the permission that is being" "\n\t\t\t\tallowed"); break; case ZFS_DELEG_NOTE_CLONE: str = gettext("Must also have the 'create' ability and 'mount'" "\n\t\t\t\tability in the origin file system"); break; case ZFS_DELEG_NOTE_CREATE: str = gettext("Must also have the 'mount' ability"); break; case ZFS_DELEG_NOTE_DESTROY: str = gettext("Must also have the 'mount' ability"); break; case ZFS_DELEG_NOTE_DIFF: str = gettext("Allows lookup of paths within a dataset;" "\n\t\t\t\tgiven an object number. Ordinary users need this" "\n\t\t\t\tin order to use zfs diff"); break; case ZFS_DELEG_NOTE_HOLD: str = gettext("Allows adding a user hold to a snapshot"); break; case ZFS_DELEG_NOTE_MOUNT: str = gettext("Allows mount/umount of ZFS datasets"); break; case ZFS_DELEG_NOTE_PROMOTE: str = gettext("Must also have the 'mount'\n\t\t\t\tand" " 'promote' ability in the origin file system"); break; case ZFS_DELEG_NOTE_RECEIVE: str = gettext("Must also have the 'mount' and 'create'" " ability"); break; case ZFS_DELEG_NOTE_RELEASE: str = gettext("Allows releasing a user hold which\n\t\t\t\t" "might destroy the snapshot"); break; case ZFS_DELEG_NOTE_RENAME: str = gettext("Must also have the 'mount' and 'create'" "\n\t\t\t\tability in the new parent"); break; case ZFS_DELEG_NOTE_ROLLBACK: str = gettext(""); break; case ZFS_DELEG_NOTE_SEND: str = gettext(""); break; case ZFS_DELEG_NOTE_SHARE: str = gettext("Allows sharing file systems over NFS or SMB" "\n\t\t\t\tprotocols"); break; case ZFS_DELEG_NOTE_SNAPSHOT: str = gettext(""); break; case ZFS_DELEG_NOTE_LOAD_KEY: str = gettext("Allows loading or unloading an encryption key"); break; case ZFS_DELEG_NOTE_CHANGE_KEY: str = gettext("Allows changing or adding an encryption key"); break; /* * case ZFS_DELEG_NOTE_VSCAN: * str = gettext(""); * break; */ /* OTHER */ case ZFS_DELEG_NOTE_GROUPQUOTA: str = gettext("Allows accessing any groupquota@... property"); break; case ZFS_DELEG_NOTE_GROUPUSED: str = gettext("Allows reading any groupused@... property"); break; case ZFS_DELEG_NOTE_USERPROP: str = gettext("Allows changing any user property"); break; case ZFS_DELEG_NOTE_USERQUOTA: str = gettext("Allows accessing any userquota@... property"); break; case ZFS_DELEG_NOTE_USERUSED: str = gettext("Allows reading any userused@... property"); break; case ZFS_DELEG_NOTE_USEROBJQUOTA: str = gettext("Allows accessing any userobjquota@... property"); break; case ZFS_DELEG_NOTE_GROUPOBJQUOTA: str = gettext("Allows accessing any \n\t\t\t\t" "groupobjquota@... property"); break; case ZFS_DELEG_NOTE_GROUPOBJUSED: str = gettext("Allows reading any groupobjused@... property"); break; case ZFS_DELEG_NOTE_USEROBJUSED: str = gettext("Allows reading any userobjused@... property"); break; case ZFS_DELEG_NOTE_PROJECTQUOTA: str = gettext("Allows accessing any projectquota@... property"); break; case ZFS_DELEG_NOTE_PROJECTOBJQUOTA: str = gettext("Allows accessing any \n\t\t\t\t" "projectobjquota@... property"); break; case ZFS_DELEG_NOTE_PROJECTUSED: str = gettext("Allows reading any projectused@... property"); break; case ZFS_DELEG_NOTE_PROJECTOBJUSED: str = gettext("Allows accessing any \n\t\t\t\t" "projectobjused@... property"); break; /* other */ default: str = ""; } return (str); } struct allow_opts { boolean_t local; boolean_t descend; boolean_t user; boolean_t group; boolean_t everyone; boolean_t create; boolean_t set; boolean_t recursive; /* unallow only */ boolean_t prt_usage; boolean_t prt_perms; char *who; char *perms; const char *dataset; }; static inline int prop_cmp(const void *a, const void *b) { const char *str1 = *(const char **)a; const char *str2 = *(const char **)b; return (strcmp(str1, str2)); } static void allow_usage(boolean_t un, boolean_t requested, const char *msg) { const char *opt_desc[] = { "-h", gettext("show this help message and exit"), "-l", gettext("set permission locally"), "-d", gettext("set permission for descents"), "-u", gettext("set permission for user"), "-g", gettext("set permission for group"), "-e", gettext("set permission for everyone"), "-c", gettext("set create time permission"), "-s", gettext("define permission set"), /* unallow only */ "-r", gettext("remove permissions recursively"), }; size_t unallow_size = sizeof (opt_desc) / sizeof (char *); size_t allow_size = unallow_size - 2; const char *props[ZFS_NUM_PROPS]; int i; size_t count = 0; FILE *fp = requested ? stdout : stderr; zprop_desc_t *pdtbl = zfs_prop_get_table(); const char *fmt = gettext("%-16s %-14s\t%s\n"); (void) fprintf(fp, gettext("Usage: %s\n"), get_usage(un ? HELP_UNALLOW : HELP_ALLOW)); (void) fprintf(fp, gettext("Options:\n")); for (i = 0; i < (un ? unallow_size : allow_size); i += 2) { const char *opt = opt_desc[i]; const char *optdsc = opt_desc[i + 1]; (void) fprintf(fp, gettext(" %-10s %s\n"), opt, optdsc); } (void) fprintf(fp, gettext("\nThe following permissions are " "supported:\n\n")); (void) fprintf(fp, fmt, gettext("NAME"), gettext("TYPE"), gettext("NOTES")); for (i = 0; i < ZFS_NUM_DELEG_NOTES; i++) { const char *perm_name = zfs_deleg_perm_tbl[i].z_perm; zfs_deleg_note_t perm_note = zfs_deleg_perm_tbl[i].z_note; const char *perm_type = deleg_perm_type(perm_note); const char *perm_comment = deleg_perm_comment(perm_note); (void) fprintf(fp, fmt, perm_name, perm_type, perm_comment); } for (i = 0; i < ZFS_NUM_PROPS; i++) { zprop_desc_t *pd = &pdtbl[i]; if (pd->pd_visible != B_TRUE) continue; if (pd->pd_attr == PROP_READONLY) continue; props[count++] = pd->pd_name; } props[count] = NULL; qsort(props, count, sizeof (char *), prop_cmp); for (i = 0; i < count; i++) (void) fprintf(fp, fmt, props[i], gettext("property"), ""); if (msg != NULL) (void) fprintf(fp, gettext("\nzfs: error: %s"), msg); exit(requested ? 0 : 2); } static inline const char * munge_args(int argc, char **argv, boolean_t un, size_t expected_argc, char **permsp) { if (un && argc == expected_argc - 1) *permsp = NULL; else if (argc == expected_argc) *permsp = argv[argc - 2]; else allow_usage(un, B_FALSE, gettext("wrong number of parameters\n")); return (argv[argc - 1]); } static void parse_allow_args(int argc, char **argv, boolean_t un, struct allow_opts *opts) { int uge_sum = opts->user + opts->group + opts->everyone; int csuge_sum = opts->create + opts->set + uge_sum; int ldcsuge_sum = csuge_sum + opts->local + opts->descend; int all_sum = un ? ldcsuge_sum + opts->recursive : ldcsuge_sum; if (uge_sum > 1) allow_usage(un, B_FALSE, gettext("-u, -g, and -e are mutually exclusive\n")); if (opts->prt_usage) { if (argc == 0 && all_sum == 0) allow_usage(un, B_TRUE, NULL); else usage(B_FALSE); } if (opts->set) { if (csuge_sum > 1) allow_usage(un, B_FALSE, gettext("invalid options combined with -s\n")); opts->dataset = munge_args(argc, argv, un, 3, &opts->perms); if (argv[0][0] != '@') allow_usage(un, B_FALSE, gettext("invalid set name: missing '@' prefix\n")); opts->who = argv[0]; } else if (opts->create) { if (ldcsuge_sum > 1) allow_usage(un, B_FALSE, gettext("invalid options combined with -c\n")); opts->dataset = munge_args(argc, argv, un, 2, &opts->perms); } else if (opts->everyone) { if (csuge_sum > 1) allow_usage(un, B_FALSE, gettext("invalid options combined with -e\n")); opts->dataset = munge_args(argc, argv, un, 2, &opts->perms); } else if (uge_sum == 0 && argc > 0 && strcmp(argv[0], "everyone") == 0) { opts->everyone = B_TRUE; argc--; argv++; opts->dataset = munge_args(argc, argv, un, 2, &opts->perms); } else if (argc == 1 && !un) { opts->prt_perms = B_TRUE; opts->dataset = argv[argc-1]; } else { opts->dataset = munge_args(argc, argv, un, 3, &opts->perms); opts->who = argv[0]; } if (!opts->local && !opts->descend) { opts->local = B_TRUE; opts->descend = B_TRUE; } } static void store_allow_perm(zfs_deleg_who_type_t type, boolean_t local, boolean_t descend, const char *who, char *perms, nvlist_t *top_nvl) { int i; char ld[2] = { '\0', '\0' }; char who_buf[MAXNAMELEN + 32]; char base_type = '\0'; char set_type = '\0'; nvlist_t *base_nvl = NULL; nvlist_t *set_nvl = NULL; nvlist_t *nvl; if (nvlist_alloc(&base_nvl, NV_UNIQUE_NAME, 0) != 0) nomem(); if (nvlist_alloc(&set_nvl, NV_UNIQUE_NAME, 0) != 0) nomem(); switch (type) { case ZFS_DELEG_NAMED_SET_SETS: case ZFS_DELEG_NAMED_SET: set_type = ZFS_DELEG_NAMED_SET_SETS; base_type = ZFS_DELEG_NAMED_SET; ld[0] = ZFS_DELEG_NA; break; case ZFS_DELEG_CREATE_SETS: case ZFS_DELEG_CREATE: set_type = ZFS_DELEG_CREATE_SETS; base_type = ZFS_DELEG_CREATE; ld[0] = ZFS_DELEG_NA; break; case ZFS_DELEG_USER_SETS: case ZFS_DELEG_USER: set_type = ZFS_DELEG_USER_SETS; base_type = ZFS_DELEG_USER; if (local) ld[0] = ZFS_DELEG_LOCAL; if (descend) ld[1] = ZFS_DELEG_DESCENDENT; break; case ZFS_DELEG_GROUP_SETS: case ZFS_DELEG_GROUP: set_type = ZFS_DELEG_GROUP_SETS; base_type = ZFS_DELEG_GROUP; if (local) ld[0] = ZFS_DELEG_LOCAL; if (descend) ld[1] = ZFS_DELEG_DESCENDENT; break; case ZFS_DELEG_EVERYONE_SETS: case ZFS_DELEG_EVERYONE: set_type = ZFS_DELEG_EVERYONE_SETS; base_type = ZFS_DELEG_EVERYONE; if (local) ld[0] = ZFS_DELEG_LOCAL; if (descend) ld[1] = ZFS_DELEG_DESCENDENT; break; default: assert(set_type != '\0' && base_type != '\0'); } if (perms != NULL) { char *curr = perms; char *end = curr + strlen(perms); while (curr < end) { char *delim = strchr(curr, ','); if (delim == NULL) delim = end; else *delim = '\0'; if (curr[0] == '@') nvl = set_nvl; else nvl = base_nvl; (void) nvlist_add_boolean(nvl, curr); if (delim != end) *delim = ','; curr = delim + 1; } for (i = 0; i < 2; i++) { char locality = ld[i]; if (locality == 0) continue; if (!nvlist_empty(base_nvl)) { if (who != NULL) (void) snprintf(who_buf, sizeof (who_buf), "%c%c$%s", base_type, locality, who); else (void) snprintf(who_buf, sizeof (who_buf), "%c%c$", base_type, locality); (void) nvlist_add_nvlist(top_nvl, who_buf, base_nvl); } if (!nvlist_empty(set_nvl)) { if (who != NULL) (void) snprintf(who_buf, sizeof (who_buf), "%c%c$%s", set_type, locality, who); else (void) snprintf(who_buf, sizeof (who_buf), "%c%c$", set_type, locality); (void) nvlist_add_nvlist(top_nvl, who_buf, set_nvl); } } } else { for (i = 0; i < 2; i++) { char locality = ld[i]; if (locality == 0) continue; if (who != NULL) (void) snprintf(who_buf, sizeof (who_buf), "%c%c$%s", base_type, locality, who); else (void) snprintf(who_buf, sizeof (who_buf), "%c%c$", base_type, locality); (void) nvlist_add_boolean(top_nvl, who_buf); if (who != NULL) (void) snprintf(who_buf, sizeof (who_buf), "%c%c$%s", set_type, locality, who); else (void) snprintf(who_buf, sizeof (who_buf), "%c%c$", set_type, locality); (void) nvlist_add_boolean(top_nvl, who_buf); } } } static int construct_fsacl_list(boolean_t un, struct allow_opts *opts, nvlist_t **nvlp) { if (nvlist_alloc(nvlp, NV_UNIQUE_NAME, 0) != 0) nomem(); if (opts->set) { store_allow_perm(ZFS_DELEG_NAMED_SET, opts->local, opts->descend, opts->who, opts->perms, *nvlp); } else if (opts->create) { store_allow_perm(ZFS_DELEG_CREATE, opts->local, opts->descend, NULL, opts->perms, *nvlp); } else if (opts->everyone) { store_allow_perm(ZFS_DELEG_EVERYONE, opts->local, opts->descend, NULL, opts->perms, *nvlp); } else { char *curr = opts->who; char *end = curr + strlen(curr); while (curr < end) { const char *who; zfs_deleg_who_type_t who_type = ZFS_DELEG_WHO_UNKNOWN; char *endch; char *delim = strchr(curr, ','); char errbuf[256]; char id[64]; struct passwd *p = NULL; struct group *g = NULL; uid_t rid; if (delim == NULL) delim = end; else *delim = '\0'; rid = (uid_t)strtol(curr, &endch, 0); if (opts->user) { who_type = ZFS_DELEG_USER; if (*endch != '\0') p = getpwnam(curr); else p = getpwuid(rid); if (p != NULL) rid = p->pw_uid; else if (*endch != '\0') { (void) snprintf(errbuf, 256, gettext( "invalid user %s\n"), curr); allow_usage(un, B_TRUE, errbuf); } } else if (opts->group) { who_type = ZFS_DELEG_GROUP; if (*endch != '\0') g = getgrnam(curr); else g = getgrgid(rid); if (g != NULL) rid = g->gr_gid; else if (*endch != '\0') { (void) snprintf(errbuf, 256, gettext( "invalid group %s\n"), curr); allow_usage(un, B_TRUE, errbuf); } } else { if (*endch != '\0') { p = getpwnam(curr); } else { p = getpwuid(rid); } if (p == NULL) { if (*endch != '\0') { g = getgrnam(curr); } else { g = getgrgid(rid); } } if (p != NULL) { who_type = ZFS_DELEG_USER; rid = p->pw_uid; } else if (g != NULL) { who_type = ZFS_DELEG_GROUP; rid = g->gr_gid; } else { (void) snprintf(errbuf, 256, gettext( "invalid user/group %s\n"), curr); allow_usage(un, B_TRUE, errbuf); } } (void) sprintf(id, "%u", rid); who = id; store_allow_perm(who_type, opts->local, opts->descend, who, opts->perms, *nvlp); curr = delim + 1; } } return (0); } static void print_set_creat_perms(uu_avl_t *who_avl) { const char *sc_title[] = { gettext("Permission sets:\n"), gettext("Create time permissions:\n"), NULL }; who_perm_node_t *who_node = NULL; int prev_weight = -1; for (who_node = uu_avl_first(who_avl); who_node != NULL; who_node = uu_avl_next(who_avl, who_node)) { uu_avl_t *avl = who_node->who_perm.who_deleg_perm_avl; zfs_deleg_who_type_t who_type = who_node->who_perm.who_type; const char *who_name = who_node->who_perm.who_name; int weight = who_type2weight(who_type); boolean_t first = B_TRUE; deleg_perm_node_t *deleg_node; if (prev_weight != weight) { (void) printf("%s", sc_title[weight]); prev_weight = weight; } if (who_name == NULL || strnlen(who_name, 1) == 0) (void) printf("\t"); else (void) printf("\t%s ", who_name); for (deleg_node = uu_avl_first(avl); deleg_node != NULL; deleg_node = uu_avl_next(avl, deleg_node)) { if (first) { (void) printf("%s", deleg_node->dpn_perm.dp_name); first = B_FALSE; } else (void) printf(",%s", deleg_node->dpn_perm.dp_name); } (void) printf("\n"); } } static void print_uge_deleg_perms(uu_avl_t *who_avl, boolean_t local, boolean_t descend, const char *title) { who_perm_node_t *who_node = NULL; boolean_t prt_title = B_TRUE; uu_avl_walk_t *walk; if ((walk = uu_avl_walk_start(who_avl, UU_WALK_ROBUST)) == NULL) nomem(); while ((who_node = uu_avl_walk_next(walk)) != NULL) { const char *who_name = who_node->who_perm.who_name; const char *nice_who_name = who_node->who_perm.who_ug_name; uu_avl_t *avl = who_node->who_perm.who_deleg_perm_avl; zfs_deleg_who_type_t who_type = who_node->who_perm.who_type; char delim = ' '; deleg_perm_node_t *deleg_node; boolean_t prt_who = B_TRUE; for (deleg_node = uu_avl_first(avl); deleg_node != NULL; deleg_node = uu_avl_next(avl, deleg_node)) { if (local != deleg_node->dpn_perm.dp_local || descend != deleg_node->dpn_perm.dp_descend) continue; if (prt_who) { const char *who = NULL; if (prt_title) { prt_title = B_FALSE; (void) printf("%s", title); } switch (who_type) { case ZFS_DELEG_USER_SETS: case ZFS_DELEG_USER: who = gettext("user"); if (nice_who_name) who_name = nice_who_name; break; case ZFS_DELEG_GROUP_SETS: case ZFS_DELEG_GROUP: who = gettext("group"); if (nice_who_name) who_name = nice_who_name; break; case ZFS_DELEG_EVERYONE_SETS: case ZFS_DELEG_EVERYONE: who = gettext("everyone"); who_name = NULL; break; default: assert(who != NULL); } prt_who = B_FALSE; if (who_name == NULL) (void) printf("\t%s", who); else (void) printf("\t%s %s", who, who_name); } (void) printf("%c%s", delim, deleg_node->dpn_perm.dp_name); delim = ','; } if (!prt_who) (void) printf("\n"); } uu_avl_walk_end(walk); } static void print_fs_perms(fs_perm_set_t *fspset) { fs_perm_node_t *node = NULL; char buf[MAXNAMELEN + 32]; const char *dsname = buf; for (node = uu_list_first(fspset->fsps_list); node != NULL; node = uu_list_next(fspset->fsps_list, node)) { uu_avl_t *sc_avl = node->fspn_fsperm.fsp_sc_avl; uu_avl_t *uge_avl = node->fspn_fsperm.fsp_uge_avl; int left = 0; (void) snprintf(buf, sizeof (buf), gettext("---- Permissions on %s "), node->fspn_fsperm.fsp_name); (void) printf("%s", dsname); left = 70 - strlen(buf); while (left-- > 0) (void) printf("-"); (void) printf("\n"); print_set_creat_perms(sc_avl); print_uge_deleg_perms(uge_avl, B_TRUE, B_FALSE, gettext("Local permissions:\n")); print_uge_deleg_perms(uge_avl, B_FALSE, B_TRUE, gettext("Descendent permissions:\n")); print_uge_deleg_perms(uge_avl, B_TRUE, B_TRUE, gettext("Local+Descendent permissions:\n")); } } static fs_perm_set_t fs_perm_set = { NULL, NULL, NULL, NULL }; struct deleg_perms { boolean_t un; nvlist_t *nvl; }; static int set_deleg_perms(zfs_handle_t *zhp, void *data) { struct deleg_perms *perms = (struct deleg_perms *)data; zfs_type_t zfs_type = zfs_get_type(zhp); if (zfs_type != ZFS_TYPE_FILESYSTEM && zfs_type != ZFS_TYPE_VOLUME) return (0); return (zfs_set_fsacl(zhp, perms->un, perms->nvl)); } static int zfs_do_allow_unallow_impl(int argc, char **argv, boolean_t un) { zfs_handle_t *zhp; nvlist_t *perm_nvl = NULL; nvlist_t *update_perm_nvl = NULL; int error = 1; int c; struct allow_opts opts = { 0 }; const char *optstr = un ? "ldugecsrh" : "ldugecsh"; /* check opts */ while ((c = getopt(argc, argv, optstr)) != -1) { switch (c) { case 'l': opts.local = B_TRUE; break; case 'd': opts.descend = B_TRUE; break; case 'u': opts.user = B_TRUE; break; case 'g': opts.group = B_TRUE; break; case 'e': opts.everyone = B_TRUE; break; case 's': opts.set = B_TRUE; break; case 'c': opts.create = B_TRUE; break; case 'r': opts.recursive = B_TRUE; break; case ':': (void) fprintf(stderr, gettext("missing argument for " "'%c' option\n"), optopt); usage(B_FALSE); break; case 'h': opts.prt_usage = B_TRUE; break; case '?': (void) fprintf(stderr, gettext("invalid option '%c'\n"), optopt); usage(B_FALSE); } } argc -= optind; argv += optind; /* check arguments */ parse_allow_args(argc, argv, un, &opts); /* try to open the dataset */ if ((zhp = zfs_open(g_zfs, opts.dataset, ZFS_TYPE_FILESYSTEM | ZFS_TYPE_VOLUME)) == NULL) { (void) fprintf(stderr, "Failed to open dataset: %s\n", opts.dataset); return (-1); } if (zfs_get_fsacl(zhp, &perm_nvl) != 0) goto cleanup2; fs_perm_set_init(&fs_perm_set); if (parse_fs_perm_set(&fs_perm_set, perm_nvl) != 0) { (void) fprintf(stderr, "Failed to parse fsacl permissions\n"); goto cleanup1; } if (opts.prt_perms) print_fs_perms(&fs_perm_set); else { (void) construct_fsacl_list(un, &opts, &update_perm_nvl); if (zfs_set_fsacl(zhp, un, update_perm_nvl) != 0) goto cleanup0; if (un && opts.recursive) { struct deleg_perms data = { un, update_perm_nvl }; if (zfs_iter_filesystems(zhp, set_deleg_perms, &data) != 0) goto cleanup0; } } error = 0; cleanup0: nvlist_free(perm_nvl); nvlist_free(update_perm_nvl); cleanup1: fs_perm_set_fini(&fs_perm_set); cleanup2: zfs_close(zhp); return (error); } static int zfs_do_allow(int argc, char **argv) { return (zfs_do_allow_unallow_impl(argc, argv, B_FALSE)); } static int zfs_do_unallow(int argc, char **argv) { return (zfs_do_allow_unallow_impl(argc, argv, B_TRUE)); } static int zfs_do_hold_rele_impl(int argc, char **argv, boolean_t holding) { int errors = 0; int i; const char *tag; boolean_t recursive = B_FALSE; const char *opts = holding ? "rt" : "r"; int c; /* check options */ while ((c = getopt(argc, argv, opts)) != -1) { switch (c) { case 'r': recursive = B_TRUE; break; case '?': (void) fprintf(stderr, gettext("invalid option '%c'\n"), optopt); usage(B_FALSE); } } argc -= optind; argv += optind; /* check number of arguments */ if (argc < 2) usage(B_FALSE); tag = argv[0]; --argc; ++argv; if (holding && tag[0] == '.') { /* tags starting with '.' are reserved for libzfs */ (void) fprintf(stderr, gettext("tag may not start with '.'\n")); usage(B_FALSE); } for (i = 0; i < argc; ++i) { zfs_handle_t *zhp; char parent[ZFS_MAX_DATASET_NAME_LEN]; const char *delim; char *path = argv[i]; delim = strchr(path, '@'); if (delim == NULL) { (void) fprintf(stderr, gettext("'%s' is not a snapshot\n"), path); ++errors; continue; } (void) strlcpy(parent, path, MIN(sizeof (parent), delim - path + 1)); zhp = zfs_open(g_zfs, parent, ZFS_TYPE_FILESYSTEM | ZFS_TYPE_VOLUME); if (zhp == NULL) { ++errors; continue; } if (holding) { if (zfs_hold(zhp, delim+1, tag, recursive, -1) != 0) ++errors; } else { if (zfs_release(zhp, delim+1, tag, recursive) != 0) ++errors; } zfs_close(zhp); } return (errors != 0); } /* * zfs hold [-r] [-t] ... * * -r Recursively hold * * Apply a user-hold with the given tag to the list of snapshots. */ static int zfs_do_hold(int argc, char **argv) { return (zfs_do_hold_rele_impl(argc, argv, B_TRUE)); } /* * zfs release [-r] ... * * -r Recursively release * * Release a user-hold with the given tag from the list of snapshots. */ static int zfs_do_release(int argc, char **argv) { return (zfs_do_hold_rele_impl(argc, argv, B_FALSE)); } typedef struct holds_cbdata { boolean_t cb_recursive; const char *cb_snapname; nvlist_t **cb_nvlp; size_t cb_max_namelen; size_t cb_max_taglen; } holds_cbdata_t; #define STRFTIME_FMT_STR "%a %b %e %H:%M %Y" #define DATETIME_BUF_LEN (32) /* * */ static void print_holds(boolean_t scripted, int nwidth, int tagwidth, nvlist_t *nvl) { int i; nvpair_t *nvp = NULL; const char *const hdr_cols[] = { "NAME", "TAG", "TIMESTAMP" }; const char *col; if (!scripted) { for (i = 0; i < 3; i++) { col = gettext(hdr_cols[i]); if (i < 2) (void) printf("%-*s ", i ? tagwidth : nwidth, col); else (void) printf("%s\n", col); } } while ((nvp = nvlist_next_nvpair(nvl, nvp)) != NULL) { char *zname = nvpair_name(nvp); nvlist_t *nvl2; nvpair_t *nvp2 = NULL; (void) nvpair_value_nvlist(nvp, &nvl2); while ((nvp2 = nvlist_next_nvpair(nvl2, nvp2)) != NULL) { char tsbuf[DATETIME_BUF_LEN]; const char *tagname = nvpair_name(nvp2); uint64_t val = 0; time_t time; struct tm t; (void) nvpair_value_uint64(nvp2, &val); time = (time_t)val; (void) localtime_r(&time, &t); (void) strftime(tsbuf, DATETIME_BUF_LEN, gettext(STRFTIME_FMT_STR), &t); if (scripted) { (void) printf("%s\t%s\t%s\n", zname, tagname, tsbuf); } else { (void) printf("%-*s %-*s %s\n", nwidth, zname, tagwidth, tagname, tsbuf); } } } } /* * Generic callback function to list a dataset or snapshot. */ static int holds_callback(zfs_handle_t *zhp, void *data) { holds_cbdata_t *cbp = data; nvlist_t *top_nvl = *cbp->cb_nvlp; nvlist_t *nvl = NULL; nvpair_t *nvp = NULL; const char *zname = zfs_get_name(zhp); size_t znamelen = strlen(zname); if (cbp->cb_recursive) { const char *snapname; char *delim = strchr(zname, '@'); if (delim == NULL) return (0); snapname = delim + 1; if (strcmp(cbp->cb_snapname, snapname)) return (0); } if (zfs_get_holds(zhp, &nvl) != 0) return (-1); if (znamelen > cbp->cb_max_namelen) cbp->cb_max_namelen = znamelen; while ((nvp = nvlist_next_nvpair(nvl, nvp)) != NULL) { const char *tag = nvpair_name(nvp); size_t taglen = strlen(tag); if (taglen > cbp->cb_max_taglen) cbp->cb_max_taglen = taglen; } return (nvlist_add_nvlist(top_nvl, zname, nvl)); } /* * zfs holds [-rH] ... * * -r Lists holds that are set on the named snapshots recursively. * -H Scripted mode; elide headers and separate columns by tabs. */ static int zfs_do_holds(int argc, char **argv) { int c; boolean_t errors = B_FALSE; boolean_t scripted = B_FALSE; boolean_t recursive = B_FALSE; int types = ZFS_TYPE_SNAPSHOT; holds_cbdata_t cb = { 0 }; int limit = 0; int ret = 0; int flags = 0; /* check options */ while ((c = getopt(argc, argv, "rH")) != -1) { switch (c) { case 'r': recursive = B_TRUE; break; case 'H': scripted = B_TRUE; break; case '?': (void) fprintf(stderr, gettext("invalid option '%c'\n"), optopt); usage(B_FALSE); } } if (recursive) { types |= ZFS_TYPE_FILESYSTEM | ZFS_TYPE_VOLUME; flags |= ZFS_ITER_RECURSE; } argc -= optind; argv += optind; /* check number of arguments */ if (argc < 1) usage(B_FALSE); nvlist_t *nvl = fnvlist_alloc(); for (int i = 0; i < argc; ++i) { char *snapshot = argv[i]; const char *delim; const char *snapname; delim = strchr(snapshot, '@'); if (delim == NULL) { (void) fprintf(stderr, gettext("'%s' is not a snapshot\n"), snapshot); errors = B_TRUE; continue; } snapname = delim + 1; if (recursive) snapshot[delim - snapshot] = '\0'; cb.cb_recursive = recursive; cb.cb_snapname = snapname; cb.cb_nvlp = &nvl; /* * 1. collect holds data, set format options */ ret = zfs_for_each(1, argv + i, flags, types, NULL, NULL, limit, holds_callback, &cb); if (ret != 0) errors = B_TRUE; } /* * 2. print holds data */ print_holds(scripted, cb.cb_max_namelen, cb.cb_max_taglen, nvl); if (nvlist_empty(nvl)) (void) fprintf(stderr, gettext("no datasets available\n")); nvlist_free(nvl); return (errors); } #define CHECK_SPINNER 30 #define SPINNER_TIME 3 /* seconds */ #define MOUNT_TIME 1 /* seconds */ typedef struct get_all_state { boolean_t ga_verbose; get_all_cb_t *ga_cbp; } get_all_state_t; static int get_one_dataset(zfs_handle_t *zhp, void *data) { static const char *const spin[] = { "-", "\\", "|", "/" }; static int spinval = 0; static int spincheck = 0; static time_t last_spin_time = (time_t)0; get_all_state_t *state = data; zfs_type_t type = zfs_get_type(zhp); if (state->ga_verbose) { if (--spincheck < 0) { time_t now = time(NULL); if (last_spin_time + SPINNER_TIME < now) { update_progress(spin[spinval++ % 4]); last_spin_time = now; } spincheck = CHECK_SPINNER; } } /* * Iterate over any nested datasets. */ if (zfs_iter_filesystems(zhp, get_one_dataset, data) != 0) { zfs_close(zhp); return (1); } /* * Skip any datasets whose type does not match. */ if ((type & ZFS_TYPE_FILESYSTEM) == 0) { zfs_close(zhp); return (0); } libzfs_add_handle(state->ga_cbp, zhp); assert(state->ga_cbp->cb_used <= state->ga_cbp->cb_alloc); return (0); } static void get_all_datasets(get_all_cb_t *cbp, boolean_t verbose) { get_all_state_t state = { .ga_verbose = verbose, .ga_cbp = cbp }; if (verbose) set_progress_header(gettext("Reading ZFS config")); (void) zfs_iter_root(g_zfs, get_one_dataset, &state); if (verbose) finish_progress(gettext("done.")); } /* * Generic callback for sharing or mounting filesystems. Because the code is so * similar, we have a common function with an extra parameter to determine which * mode we are using. */ typedef enum { OP_SHARE, OP_MOUNT } share_mount_op_t; typedef struct share_mount_state { share_mount_op_t sm_op; boolean_t sm_verbose; int sm_flags; char *sm_options; enum sa_protocol sm_proto; /* only valid for OP_SHARE */ pthread_mutex_t sm_lock; /* protects the remaining fields */ uint_t sm_total; /* number of filesystems to process */ uint_t sm_done; /* number of filesystems processed */ int sm_status; /* -1 if any of the share/mount operations failed */ } share_mount_state_t; /* * Share or mount a dataset. */ static int share_mount_one(zfs_handle_t *zhp, int op, int flags, enum sa_protocol protocol, boolean_t explicit, const char *options) { char mountpoint[ZFS_MAXPROPLEN]; char shareopts[ZFS_MAXPROPLEN]; char smbshareopts[ZFS_MAXPROPLEN]; const char *cmdname = op == OP_SHARE ? "share" : "mount"; struct mnttab mnt; uint64_t zoned, canmount; boolean_t shared_nfs, shared_smb; assert(zfs_get_type(zhp) & ZFS_TYPE_FILESYSTEM); /* * Check to make sure we can mount/share this dataset. If we * are in the global zone and the filesystem is exported to a * local zone, or if we are in a local zone and the * filesystem is not exported, then it is an error. */ zoned = zfs_prop_get_int(zhp, ZFS_PROP_ZONED); if (zoned && getzoneid() == GLOBAL_ZONEID) { if (!explicit) return (0); (void) fprintf(stderr, gettext("cannot %s '%s': " "dataset is exported to a local zone\n"), cmdname, zfs_get_name(zhp)); return (1); } else if (!zoned && getzoneid() != GLOBAL_ZONEID) { if (!explicit) return (0); (void) fprintf(stderr, gettext("cannot %s '%s': " "permission denied\n"), cmdname, zfs_get_name(zhp)); return (1); } /* * Ignore any filesystems which don't apply to us. This * includes those with a legacy mountpoint, or those with * legacy share options. */ verify(zfs_prop_get(zhp, ZFS_PROP_MOUNTPOINT, mountpoint, sizeof (mountpoint), NULL, NULL, 0, B_FALSE) == 0); verify(zfs_prop_get(zhp, ZFS_PROP_SHARENFS, shareopts, sizeof (shareopts), NULL, NULL, 0, B_FALSE) == 0); verify(zfs_prop_get(zhp, ZFS_PROP_SHARESMB, smbshareopts, sizeof (smbshareopts), NULL, NULL, 0, B_FALSE) == 0); if (op == OP_SHARE && strcmp(shareopts, "off") == 0 && strcmp(smbshareopts, "off") == 0) { if (!explicit) return (0); (void) fprintf(stderr, gettext("cannot share '%s': " "legacy share\n"), zfs_get_name(zhp)); (void) fprintf(stderr, gettext("use exports(5) or " "smb.conf(5) to share this filesystem, or set " "the sharenfs or sharesmb property\n")); return (1); } /* * We cannot share or mount legacy filesystems. If the * shareopts is non-legacy but the mountpoint is legacy, we * treat it as a legacy share. */ if (strcmp(mountpoint, "legacy") == 0) { if (!explicit) return (0); (void) fprintf(stderr, gettext("cannot %s '%s': " "legacy mountpoint\n"), cmdname, zfs_get_name(zhp)); (void) fprintf(stderr, gettext("use %s(8) to " "%s this filesystem\n"), cmdname, cmdname); return (1); } if (strcmp(mountpoint, "none") == 0) { if (!explicit) return (0); (void) fprintf(stderr, gettext("cannot %s '%s': no " "mountpoint set\n"), cmdname, zfs_get_name(zhp)); return (1); } /* * canmount explicit outcome * on no pass through * on yes pass through * off no return 0 * off yes display error, return 1 * noauto no return 0 * noauto yes pass through */ canmount = zfs_prop_get_int(zhp, ZFS_PROP_CANMOUNT); if (canmount == ZFS_CANMOUNT_OFF) { if (!explicit) return (0); (void) fprintf(stderr, gettext("cannot %s '%s': " "'canmount' property is set to 'off'\n"), cmdname, zfs_get_name(zhp)); return (1); } else if (canmount == ZFS_CANMOUNT_NOAUTO && !explicit) { /* * When performing a 'zfs mount -a', we skip any mounts for * datasets that have 'noauto' set. Sharing a dataset with * 'noauto' set is only allowed if it's mounted. */ if (op == OP_MOUNT) return (0); if (op == OP_SHARE && !zfs_is_mounted(zhp, NULL)) { /* also purge it from existing exports */ zfs_unshare(zhp, mountpoint, NULL); return (0); } } /* * If this filesystem is encrypted and does not have * a loaded key, we can not mount it. */ if ((flags & MS_CRYPT) == 0 && zfs_prop_get_int(zhp, ZFS_PROP_ENCRYPTION) != ZIO_CRYPT_OFF && zfs_prop_get_int(zhp, ZFS_PROP_KEYSTATUS) == ZFS_KEYSTATUS_UNAVAILABLE) { if (!explicit) return (0); (void) fprintf(stderr, gettext("cannot %s '%s': " "encryption key not loaded\n"), cmdname, zfs_get_name(zhp)); return (1); } /* * If this filesystem is inconsistent and has a receive resume * token, we can not mount it. */ if (zfs_prop_get_int(zhp, ZFS_PROP_INCONSISTENT) && zfs_prop_get(zhp, ZFS_PROP_RECEIVE_RESUME_TOKEN, NULL, 0, NULL, NULL, 0, B_TRUE) == 0) { if (!explicit) return (0); (void) fprintf(stderr, gettext("cannot %s '%s': " "Contains partially-completed state from " "\"zfs receive -s\", which can be resumed with " "\"zfs send -t\"\n"), cmdname, zfs_get_name(zhp)); return (1); } if (zfs_prop_get_int(zhp, ZFS_PROP_REDACTED) && !(flags & MS_FORCE)) { if (!explicit) return (0); (void) fprintf(stderr, gettext("cannot %s '%s': " "Dataset is not complete, was created by receiving " "a redacted zfs send stream.\n"), cmdname, zfs_get_name(zhp)); return (1); } /* * At this point, we have verified that the mountpoint and/or * shareopts are appropriate for auto management. If the * filesystem is already mounted or shared, return (failing * for explicit requests); otherwise mount or share the * filesystem. */ switch (op) { case OP_SHARE: { enum sa_protocol prot[] = {SA_PROTOCOL_NFS, SA_NO_PROTOCOL}; shared_nfs = zfs_is_shared(zhp, NULL, prot); *prot = SA_PROTOCOL_SMB; shared_smb = zfs_is_shared(zhp, NULL, prot); if ((shared_nfs && shared_smb) || (shared_nfs && strcmp(shareopts, "on") == 0 && strcmp(smbshareopts, "off") == 0) || (shared_smb && strcmp(smbshareopts, "on") == 0 && strcmp(shareopts, "off") == 0)) { if (!explicit) return (0); (void) fprintf(stderr, gettext("cannot share " "'%s': filesystem already shared\n"), zfs_get_name(zhp)); return (1); } if (!zfs_is_mounted(zhp, NULL) && zfs_mount(zhp, NULL, flags) != 0) return (1); *prot = protocol; if (zfs_share(zhp, protocol == SA_NO_PROTOCOL ? NULL : prot)) return (1); } break; case OP_MOUNT: mnt.mnt_mntopts = (char *)(options ?: ""); if (!hasmntopt(&mnt, MNTOPT_REMOUNT) && zfs_is_mounted(zhp, NULL)) { if (!explicit) return (0); (void) fprintf(stderr, gettext("cannot mount " "'%s': filesystem already mounted\n"), zfs_get_name(zhp)); return (1); } if (zfs_mount(zhp, options, flags) != 0) return (1); break; } return (0); } /* * Reports progress in the form "(current/total)". Not thread-safe. */ static void report_mount_progress(int current, int total) { static time_t last_progress_time = 0; time_t now = time(NULL); char info[32]; /* display header if we're here for the first time */ if (current == 1) { set_progress_header(gettext("Mounting ZFS filesystems")); } else if (current != total && last_progress_time + MOUNT_TIME >= now) { /* too soon to report again */ return; } last_progress_time = now; (void) sprintf(info, "(%d/%d)", current, total); if (current == total) finish_progress(info); else update_progress(info); } /* * zfs_foreach_mountpoint() callback that mounts or shares one filesystem and * updates the progress meter. */ static int share_mount_one_cb(zfs_handle_t *zhp, void *arg) { share_mount_state_t *sms = arg; int ret; ret = share_mount_one(zhp, sms->sm_op, sms->sm_flags, sms->sm_proto, B_FALSE, sms->sm_options); pthread_mutex_lock(&sms->sm_lock); if (ret != 0) sms->sm_status = ret; sms->sm_done++; if (sms->sm_verbose) report_mount_progress(sms->sm_done, sms->sm_total); pthread_mutex_unlock(&sms->sm_lock); return (ret); } static void append_options(char *mntopts, char *newopts) { int len = strlen(mntopts); /* original length plus new string to append plus 1 for the comma */ if (len + 1 + strlen(newopts) >= MNT_LINE_MAX) { (void) fprintf(stderr, gettext("the opts argument for " "'%s' option is too long (more than %d chars)\n"), "-o", MNT_LINE_MAX); usage(B_FALSE); } if (*mntopts) mntopts[len++] = ','; (void) strcpy(&mntopts[len], newopts); } static enum sa_protocol sa_protocol_decode(const char *protocol) { for (enum sa_protocol i = 0; i < ARRAY_SIZE(sa_protocol_names); ++i) if (strcmp(protocol, sa_protocol_names[i]) == 0) return (i); (void) fputs(gettext("share type must be one of: "), stderr); for (enum sa_protocol i = 0; i < ARRAY_SIZE(sa_protocol_names); ++i) (void) fprintf(stderr, "%s%s", i != 0 ? ", " : "", sa_protocol_names[i]); (void) fputc('\n', stderr); usage(B_FALSE); } static int share_mount(int op, int argc, char **argv) { int do_all = 0; boolean_t verbose = B_FALSE; int c, ret = 0; char *options = NULL; int flags = 0; /* check options */ while ((c = getopt(argc, argv, op == OP_MOUNT ? ":alvo:Of" : "al")) != -1) { switch (c) { case 'a': do_all = 1; break; case 'v': verbose = B_TRUE; break; case 'l': flags |= MS_CRYPT; break; case 'o': if (*optarg == '\0') { (void) fprintf(stderr, gettext("empty mount " "options (-o) specified\n")); usage(B_FALSE); } if (options == NULL) options = safe_malloc(MNT_LINE_MAX + 1); /* option validation is done later */ append_options(options, optarg); break; case 'O': flags |= MS_OVERLAY; break; case 'f': flags |= MS_FORCE; break; case ':': (void) fprintf(stderr, gettext("missing argument for " "'%c' option\n"), optopt); usage(B_FALSE); break; case '?': (void) fprintf(stderr, gettext("invalid option '%c'\n"), optopt); usage(B_FALSE); } } argc -= optind; argv += optind; /* check number of arguments */ if (do_all) { enum sa_protocol protocol = SA_NO_PROTOCOL; if (op == OP_SHARE && argc > 0) { protocol = sa_protocol_decode(argv[0]); argc--; argv++; } if (argc != 0) { (void) fprintf(stderr, gettext("too many arguments\n")); usage(B_FALSE); } start_progress_timer(); get_all_cb_t cb = { 0 }; get_all_datasets(&cb, verbose); if (cb.cb_used == 0) { free(options); return (0); } share_mount_state_t share_mount_state = { 0 }; share_mount_state.sm_op = op; share_mount_state.sm_verbose = verbose; share_mount_state.sm_flags = flags; share_mount_state.sm_options = options; share_mount_state.sm_proto = protocol; share_mount_state.sm_total = cb.cb_used; pthread_mutex_init(&share_mount_state.sm_lock, NULL); /* For a 'zfs share -a' operation start with a clean slate. */ zfs_truncate_shares(NULL); /* * libshare isn't mt-safe, so only do the operation in parallel * if we're mounting. Additionally, the key-loading option must * be serialized so that we can prompt the user for their keys * in a consistent manner. */ zfs_foreach_mountpoint(g_zfs, cb.cb_handles, cb.cb_used, share_mount_one_cb, &share_mount_state, op == OP_MOUNT && !(flags & MS_CRYPT)); zfs_commit_shares(NULL); ret = share_mount_state.sm_status; for (int i = 0; i < cb.cb_used; i++) zfs_close(cb.cb_handles[i]); free(cb.cb_handles); } else if (argc == 0) { FILE *mnttab; struct mnttab entry; if ((op == OP_SHARE) || (options != NULL)) { (void) fprintf(stderr, gettext("missing filesystem " "argument (specify -a for all)\n")); usage(B_FALSE); } /* * When mount is given no arguments, go through * /proc/self/mounts and display any active ZFS mounts. * We hide any snapshots, since they are controlled * automatically. */ if ((mnttab = fopen(MNTTAB, "re")) == NULL) { free(options); return (ENOENT); } while (getmntent(mnttab, &entry) == 0) { if (strcmp(entry.mnt_fstype, MNTTYPE_ZFS) != 0 || strchr(entry.mnt_special, '@') != NULL) continue; (void) printf("%-30s %s\n", entry.mnt_special, entry.mnt_mountp); } (void) fclose(mnttab); } else { zfs_handle_t *zhp; if (argc > 1) { (void) fprintf(stderr, gettext("too many arguments\n")); usage(B_FALSE); } if ((zhp = zfs_open(g_zfs, argv[0], ZFS_TYPE_FILESYSTEM)) == NULL) { ret = 1; } else { ret = share_mount_one(zhp, op, flags, SA_NO_PROTOCOL, B_TRUE, options); zfs_commit_shares(NULL); zfs_close(zhp); } } free(options); return (ret); } /* * zfs mount -a * zfs mount filesystem * * Mount all filesystems, or mount the given filesystem. */ static int zfs_do_mount(int argc, char **argv) { return (share_mount(OP_MOUNT, argc, argv)); } /* * zfs share -a [nfs | smb] * zfs share filesystem * * Share all filesystems, or share the given filesystem. */ static int zfs_do_share(int argc, char **argv) { return (share_mount(OP_SHARE, argc, argv)); } typedef struct unshare_unmount_node { zfs_handle_t *un_zhp; char *un_mountp; uu_avl_node_t un_avlnode; } unshare_unmount_node_t; static int unshare_unmount_compare(const void *larg, const void *rarg, void *unused) { (void) unused; const unshare_unmount_node_t *l = larg; const unshare_unmount_node_t *r = rarg; return (strcmp(l->un_mountp, r->un_mountp)); } /* * Convenience routine used by zfs_do_umount() and manual_unmount(). Given an * absolute path, find the entry /proc/self/mounts, verify that it's a * ZFS filesystem, and unmount it appropriately. */ static int unshare_unmount_path(int op, char *path, int flags, boolean_t is_manual) { zfs_handle_t *zhp; int ret = 0; struct stat64 statbuf; struct extmnttab entry; const char *cmdname = (op == OP_SHARE) ? "unshare" : "unmount"; ino_t path_inode; /* * Search for the given (major,minor) pair in the mount table. */ if (getextmntent(path, &entry, &statbuf) != 0) { if (op == OP_SHARE) { (void) fprintf(stderr, gettext("cannot %s '%s': not " "currently mounted\n"), cmdname, path); return (1); } (void) fprintf(stderr, gettext("warning: %s not in" "/proc/self/mounts\n"), path); if ((ret = umount2(path, flags)) != 0) (void) fprintf(stderr, gettext("%s: %s\n"), path, strerror(errno)); return (ret != 0); } path_inode = statbuf.st_ino; if (strcmp(entry.mnt_fstype, MNTTYPE_ZFS) != 0) { (void) fprintf(stderr, gettext("cannot %s '%s': not a ZFS " "filesystem\n"), cmdname, path); return (1); } if ((zhp = zfs_open(g_zfs, entry.mnt_special, ZFS_TYPE_FILESYSTEM)) == NULL) return (1); ret = 1; if (stat64(entry.mnt_mountp, &statbuf) != 0) { (void) fprintf(stderr, gettext("cannot %s '%s': %s\n"), cmdname, path, strerror(errno)); goto out; } else if (statbuf.st_ino != path_inode) { (void) fprintf(stderr, gettext("cannot " "%s '%s': not a mountpoint\n"), cmdname, path); goto out; } if (op == OP_SHARE) { char nfs_mnt_prop[ZFS_MAXPROPLEN]; char smbshare_prop[ZFS_MAXPROPLEN]; verify(zfs_prop_get(zhp, ZFS_PROP_SHARENFS, nfs_mnt_prop, sizeof (nfs_mnt_prop), NULL, NULL, 0, B_FALSE) == 0); verify(zfs_prop_get(zhp, ZFS_PROP_SHARESMB, smbshare_prop, sizeof (smbshare_prop), NULL, NULL, 0, B_FALSE) == 0); if (strcmp(nfs_mnt_prop, "off") == 0 && strcmp(smbshare_prop, "off") == 0) { (void) fprintf(stderr, gettext("cannot unshare " "'%s': legacy share\n"), path); (void) fprintf(stderr, gettext("use exportfs(8) " "or smbcontrol(1) to unshare this filesystem\n")); } else if (!zfs_is_shared(zhp, NULL, NULL)) { (void) fprintf(stderr, gettext("cannot unshare '%s': " "not currently shared\n"), path); } else { ret = zfs_unshare(zhp, path, NULL); zfs_commit_shares(NULL); } } else { char mtpt_prop[ZFS_MAXPROPLEN]; verify(zfs_prop_get(zhp, ZFS_PROP_MOUNTPOINT, mtpt_prop, sizeof (mtpt_prop), NULL, NULL, 0, B_FALSE) == 0); if (is_manual) { ret = zfs_unmount(zhp, NULL, flags); } else if (strcmp(mtpt_prop, "legacy") == 0) { (void) fprintf(stderr, gettext("cannot unmount " "'%s': legacy mountpoint\n"), zfs_get_name(zhp)); (void) fprintf(stderr, gettext("use umount(8) " "to unmount this filesystem\n")); } else { ret = zfs_unmountall(zhp, flags); } } out: zfs_close(zhp); return (ret != 0); } /* * Generic callback for unsharing or unmounting a filesystem. */ static int unshare_unmount(int op, int argc, char **argv) { int do_all = 0; int flags = 0; int ret = 0; int c; zfs_handle_t *zhp; char nfs_mnt_prop[ZFS_MAXPROPLEN]; char sharesmb[ZFS_MAXPROPLEN]; /* check options */ while ((c = getopt(argc, argv, op == OP_SHARE ? ":a" : "afu")) != -1) { switch (c) { case 'a': do_all = 1; break; case 'f': flags |= MS_FORCE; break; case 'u': flags |= MS_CRYPT; break; case ':': (void) fprintf(stderr, gettext("missing argument for " "'%c' option\n"), optopt); usage(B_FALSE); break; case '?': (void) fprintf(stderr, gettext("invalid option '%c'\n"), optopt); usage(B_FALSE); } } argc -= optind; argv += optind; if (do_all) { /* * We could make use of zfs_for_each() to walk all datasets in * the system, but this would be very inefficient, especially * since we would have to linearly search /proc/self/mounts for * each one. Instead, do one pass through /proc/self/mounts * looking for zfs entries and call zfs_unmount() for each one. * * Things get a little tricky if the administrator has created * mountpoints beneath other ZFS filesystems. In this case, we * have to unmount the deepest filesystems first. To accomplish * this, we place all the mountpoints in an AVL tree sorted by * the special type (dataset name), and walk the result in * reverse to make sure to get any snapshots first. */ FILE *mnttab; struct mnttab entry; uu_avl_pool_t *pool; uu_avl_t *tree = NULL; unshare_unmount_node_t *node; uu_avl_index_t idx; uu_avl_walk_t *walk; enum sa_protocol *protocol = NULL, single_protocol[] = {SA_NO_PROTOCOL, SA_NO_PROTOCOL}; if (op == OP_SHARE && argc > 0) { *single_protocol = sa_protocol_decode(argv[0]); protocol = single_protocol; argc--; argv++; } if (argc != 0) { (void) fprintf(stderr, gettext("too many arguments\n")); usage(B_FALSE); } if (((pool = uu_avl_pool_create("unmount_pool", sizeof (unshare_unmount_node_t), offsetof(unshare_unmount_node_t, un_avlnode), unshare_unmount_compare, UU_DEFAULT)) == NULL) || ((tree = uu_avl_create(pool, NULL, UU_DEFAULT)) == NULL)) nomem(); if ((mnttab = fopen(MNTTAB, "re")) == NULL) { uu_avl_destroy(tree); uu_avl_pool_destroy(pool); return (ENOENT); } while (getmntent(mnttab, &entry) == 0) { /* ignore non-ZFS entries */ if (strcmp(entry.mnt_fstype, MNTTYPE_ZFS) != 0) continue; /* ignore snapshots */ if (strchr(entry.mnt_special, '@') != NULL) continue; if ((zhp = zfs_open(g_zfs, entry.mnt_special, ZFS_TYPE_FILESYSTEM)) == NULL) { ret = 1; continue; } /* * Ignore datasets that are excluded/restricted by * parent pool name. */ if (zpool_skip_pool(zfs_get_pool_name(zhp))) { zfs_close(zhp); continue; } switch (op) { case OP_SHARE: verify(zfs_prop_get(zhp, ZFS_PROP_SHARENFS, nfs_mnt_prop, sizeof (nfs_mnt_prop), NULL, NULL, 0, B_FALSE) == 0); if (strcmp(nfs_mnt_prop, "off") != 0) break; verify(zfs_prop_get(zhp, ZFS_PROP_SHARESMB, nfs_mnt_prop, sizeof (nfs_mnt_prop), NULL, NULL, 0, B_FALSE) == 0); if (strcmp(nfs_mnt_prop, "off") == 0) continue; break; case OP_MOUNT: /* Ignore legacy mounts */ verify(zfs_prop_get(zhp, ZFS_PROP_MOUNTPOINT, nfs_mnt_prop, sizeof (nfs_mnt_prop), NULL, NULL, 0, B_FALSE) == 0); if (strcmp(nfs_mnt_prop, "legacy") == 0) continue; /* Ignore canmount=noauto mounts */ if (zfs_prop_get_int(zhp, ZFS_PROP_CANMOUNT) == ZFS_CANMOUNT_NOAUTO) continue; break; default: break; } node = safe_malloc(sizeof (unshare_unmount_node_t)); node->un_zhp = zhp; node->un_mountp = safe_strdup(entry.mnt_mountp); uu_avl_node_init(node, &node->un_avlnode, pool); if (uu_avl_find(tree, node, NULL, &idx) == NULL) { uu_avl_insert(tree, node, idx); } else { zfs_close(node->un_zhp); free(node->un_mountp); free(node); } } (void) fclose(mnttab); /* * Walk the AVL tree in reverse, unmounting each filesystem and * removing it from the AVL tree in the process. */ if ((walk = uu_avl_walk_start(tree, UU_WALK_REVERSE | UU_WALK_ROBUST)) == NULL) nomem(); while ((node = uu_avl_walk_next(walk)) != NULL) { const char *mntarg = NULL; uu_avl_remove(tree, node); switch (op) { case OP_SHARE: if (zfs_unshare(node->un_zhp, node->un_mountp, protocol) != 0) ret = 1; break; case OP_MOUNT: if (zfs_unmount(node->un_zhp, mntarg, flags) != 0) ret = 1; break; } zfs_close(node->un_zhp); free(node->un_mountp); free(node); } if (op == OP_SHARE) zfs_commit_shares(protocol); uu_avl_walk_end(walk); uu_avl_destroy(tree); uu_avl_pool_destroy(pool); } else { if (argc != 1) { if (argc == 0) (void) fprintf(stderr, gettext("missing filesystem argument\n")); else (void) fprintf(stderr, gettext("too many arguments\n")); usage(B_FALSE); } /* * We have an argument, but it may be a full path or a ZFS * filesystem. Pass full paths off to unmount_path() (shared by * manual_unmount), otherwise open the filesystem and pass to * zfs_unmount(). */ if (argv[0][0] == '/') return (unshare_unmount_path(op, argv[0], flags, B_FALSE)); if ((zhp = zfs_open(g_zfs, argv[0], ZFS_TYPE_FILESYSTEM)) == NULL) return (1); verify(zfs_prop_get(zhp, op == OP_SHARE ? ZFS_PROP_SHARENFS : ZFS_PROP_MOUNTPOINT, nfs_mnt_prop, sizeof (nfs_mnt_prop), NULL, NULL, 0, B_FALSE) == 0); switch (op) { case OP_SHARE: verify(zfs_prop_get(zhp, ZFS_PROP_SHARENFS, nfs_mnt_prop, sizeof (nfs_mnt_prop), NULL, NULL, 0, B_FALSE) == 0); verify(zfs_prop_get(zhp, ZFS_PROP_SHARESMB, sharesmb, sizeof (sharesmb), NULL, NULL, 0, B_FALSE) == 0); if (strcmp(nfs_mnt_prop, "off") == 0 && strcmp(sharesmb, "off") == 0) { (void) fprintf(stderr, gettext("cannot " "unshare '%s': legacy share\n"), zfs_get_name(zhp)); (void) fprintf(stderr, gettext("use " "exports(5) or smb.conf(5) to unshare " "this filesystem\n")); ret = 1; } else if (!zfs_is_shared(zhp, NULL, NULL)) { (void) fprintf(stderr, gettext("cannot " "unshare '%s': not currently " "shared\n"), zfs_get_name(zhp)); ret = 1; } else if (zfs_unshareall(zhp, NULL) != 0) { ret = 1; } break; case OP_MOUNT: if (strcmp(nfs_mnt_prop, "legacy") == 0) { (void) fprintf(stderr, gettext("cannot " "unmount '%s': legacy " "mountpoint\n"), zfs_get_name(zhp)); (void) fprintf(stderr, gettext("use " "umount(8) to unmount this " "filesystem\n")); ret = 1; } else if (!zfs_is_mounted(zhp, NULL)) { (void) fprintf(stderr, gettext("cannot " "unmount '%s': not currently " "mounted\n"), zfs_get_name(zhp)); ret = 1; } else if (zfs_unmountall(zhp, flags) != 0) { ret = 1; } break; } zfs_close(zhp); } return (ret); } /* * zfs unmount [-fu] -a * zfs unmount [-fu] filesystem * * Unmount all filesystems, or a specific ZFS filesystem. */ static int zfs_do_unmount(int argc, char **argv) { return (unshare_unmount(OP_MOUNT, argc, argv)); } /* * zfs unshare -a * zfs unshare filesystem * * Unshare all filesystems, or a specific ZFS filesystem. */ static int zfs_do_unshare(int argc, char **argv) { return (unshare_unmount(OP_SHARE, argc, argv)); } static int find_command_idx(const char *command, int *idx) { int i; for (i = 0; i < NCOMMAND; i++) { if (command_table[i].name == NULL) continue; if (strcmp(command, command_table[i].name) == 0) { *idx = i; return (0); } } return (1); } static int zfs_do_diff(int argc, char **argv) { zfs_handle_t *zhp; int flags = 0; char *tosnap = NULL; char *fromsnap = NULL; char *atp, *copy; int err = 0; int c; struct sigaction sa; while ((c = getopt(argc, argv, "FHth")) != -1) { switch (c) { case 'F': flags |= ZFS_DIFF_CLASSIFY; break; case 'H': flags |= ZFS_DIFF_PARSEABLE; break; case 't': flags |= ZFS_DIFF_TIMESTAMP; break; case 'h': flags |= ZFS_DIFF_NO_MANGLE; break; default: (void) fprintf(stderr, gettext("invalid option '%c'\n"), optopt); usage(B_FALSE); } } argc -= optind; argv += optind; if (argc < 1) { (void) fprintf(stderr, gettext("must provide at least one snapshot name\n")); usage(B_FALSE); } if (argc > 2) { (void) fprintf(stderr, gettext("too many arguments\n")); usage(B_FALSE); } fromsnap = argv[0]; tosnap = (argc == 2) ? argv[1] : NULL; copy = NULL; if (*fromsnap != '@') copy = strdup(fromsnap); else if (tosnap) copy = strdup(tosnap); if (copy == NULL) usage(B_FALSE); if ((atp = strchr(copy, '@')) != NULL) *atp = '\0'; if ((zhp = zfs_open(g_zfs, copy, ZFS_TYPE_FILESYSTEM)) == NULL) { free(copy); return (1); } free(copy); /* * Ignore SIGPIPE so that the library can give us * information on any failure */ if (sigemptyset(&sa.sa_mask) == -1) { err = errno; goto out; } sa.sa_flags = 0; sa.sa_handler = SIG_IGN; if (sigaction(SIGPIPE, &sa, NULL) == -1) { err = errno; goto out; } err = zfs_show_diffs(zhp, STDOUT_FILENO, fromsnap, tosnap, flags); out: zfs_close(zhp); return (err != 0); } /* * zfs bookmark | * * Creates a bookmark with the given name from the source snapshot * or creates a copy of an existing source bookmark. */ static int zfs_do_bookmark(int argc, char **argv) { char *source, *bookname; char expbuf[ZFS_MAX_DATASET_NAME_LEN]; int source_type; nvlist_t *nvl; int ret = 0; int c; /* check options */ while ((c = getopt(argc, argv, "")) != -1) { switch (c) { case '?': (void) fprintf(stderr, gettext("invalid option '%c'\n"), optopt); goto usage; } } argc -= optind; argv += optind; /* check number of arguments */ if (argc < 1) { (void) fprintf(stderr, gettext("missing source argument\n")); goto usage; } if (argc < 2) { (void) fprintf(stderr, gettext("missing bookmark argument\n")); goto usage; } source = argv[0]; bookname = argv[1]; if (strchr(source, '@') == NULL && strchr(source, '#') == NULL) { (void) fprintf(stderr, gettext("invalid source name '%s': " "must contain a '@' or '#'\n"), source); goto usage; } if (strchr(bookname, '#') == NULL) { (void) fprintf(stderr, gettext("invalid bookmark name '%s': " "must contain a '#'\n"), bookname); goto usage; } /* * expand source or bookname to full path: * one of them may be specified as short name */ { char **expand; char *source_short, *bookname_short; source_short = strpbrk(source, "@#"); bookname_short = strpbrk(bookname, "#"); if (source_short == source && bookname_short == bookname) { (void) fprintf(stderr, gettext( "either source or bookmark must be specified as " "full dataset paths")); goto usage; } else if (source_short != source && bookname_short != bookname) { expand = NULL; } else if (source_short != source) { strlcpy(expbuf, source, sizeof (expbuf)); expand = &bookname; } else if (bookname_short != bookname) { strlcpy(expbuf, bookname, sizeof (expbuf)); expand = &source; } else { abort(); } if (expand != NULL) { *strpbrk(expbuf, "@#") = '\0'; /* dataset name in buf */ (void) strlcat(expbuf, *expand, sizeof (expbuf)); *expand = expbuf; } } /* determine source type */ switch (*strpbrk(source, "@#")) { case '@': source_type = ZFS_TYPE_SNAPSHOT; break; case '#': source_type = ZFS_TYPE_BOOKMARK; break; default: abort(); } /* test the source exists */ zfs_handle_t *zhp; zhp = zfs_open(g_zfs, source, source_type); if (zhp == NULL) goto usage; zfs_close(zhp); nvl = fnvlist_alloc(); fnvlist_add_string(nvl, bookname, source); ret = lzc_bookmark(nvl, NULL); fnvlist_free(nvl); if (ret != 0) { const char *err_msg = NULL; char errbuf[1024]; (void) snprintf(errbuf, sizeof (errbuf), dgettext(TEXT_DOMAIN, "cannot create bookmark '%s'"), bookname); switch (ret) { case EXDEV: err_msg = "bookmark is in a different pool"; break; case ZFS_ERR_BOOKMARK_SOURCE_NOT_ANCESTOR: err_msg = "source is not an ancestor of the " "new bookmark's dataset"; break; case EEXIST: err_msg = "bookmark exists"; break; case EINVAL: err_msg = "invalid argument"; break; case ENOTSUP: err_msg = "bookmark feature not enabled"; break; case ENOSPC: err_msg = "out of space"; break; case ENOENT: err_msg = "dataset does not exist"; break; default: (void) zfs_standard_error(g_zfs, ret, errbuf); break; } if (err_msg != NULL) { (void) fprintf(stderr, "%s: %s\n", errbuf, dgettext(TEXT_DOMAIN, err_msg)); } } return (ret != 0); usage: usage(B_FALSE); return (-1); } static int zfs_do_channel_program(int argc, char **argv) { int ret, fd, c; size_t progsize, progread; nvlist_t *outnvl = NULL; uint64_t instrlimit = ZCP_DEFAULT_INSTRLIMIT; uint64_t memlimit = ZCP_DEFAULT_MEMLIMIT; boolean_t sync_flag = B_TRUE, json_output = B_FALSE; zpool_handle_t *zhp; /* check options */ while ((c = getopt(argc, argv, "nt:m:j")) != -1) { switch (c) { case 't': case 'm': { uint64_t arg; char *endp; errno = 0; arg = strtoull(optarg, &endp, 0); if (errno != 0 || *endp != '\0') { (void) fprintf(stderr, gettext( "invalid argument " "'%s': expected integer\n"), optarg); goto usage; } if (c == 't') { instrlimit = arg; } else { ASSERT3U(c, ==, 'm'); memlimit = arg; } break; } case 'n': { sync_flag = B_FALSE; break; } case 'j': { json_output = B_TRUE; break; } case '?': (void) fprintf(stderr, gettext("invalid option '%c'\n"), optopt); goto usage; } } argc -= optind; argv += optind; if (argc < 2) { (void) fprintf(stderr, gettext("invalid number of arguments\n")); goto usage; } const char *poolname = argv[0]; const char *filename = argv[1]; if (strcmp(filename, "-") == 0) { fd = 0; filename = "standard input"; } else if ((fd = open(filename, O_RDONLY)) < 0) { (void) fprintf(stderr, gettext("cannot open '%s': %s\n"), filename, strerror(errno)); return (1); } if ((zhp = zpool_open(g_zfs, poolname)) == NULL) { (void) fprintf(stderr, gettext("cannot open pool '%s'\n"), poolname); if (fd != 0) (void) close(fd); return (1); } zpool_close(zhp); /* * Read in the channel program, expanding the program buffer as * necessary. */ progread = 0; progsize = 1024; char *progbuf = safe_malloc(progsize); do { ret = read(fd, progbuf + progread, progsize - progread); progread += ret; if (progread == progsize && ret > 0) { progsize *= 2; progbuf = safe_realloc(progbuf, progsize); } } while (ret > 0); if (fd != 0) (void) close(fd); if (ret < 0) { free(progbuf); (void) fprintf(stderr, gettext("cannot read '%s': %s\n"), filename, strerror(errno)); return (1); } progbuf[progread] = '\0'; /* * Any remaining arguments are passed as arguments to the lua script as * a string array: * { * "argv" -> [ "arg 1", ... "arg n" ], * } */ nvlist_t *argnvl = fnvlist_alloc(); fnvlist_add_string_array(argnvl, ZCP_ARG_CLIARGV, (const char **)argv + 2, argc - 2); if (sync_flag) { ret = lzc_channel_program(poolname, progbuf, instrlimit, memlimit, argnvl, &outnvl); } else { ret = lzc_channel_program_nosync(poolname, progbuf, instrlimit, memlimit, argnvl, &outnvl); } if (ret != 0) { /* * On error, report the error message handed back by lua if one * exists. Otherwise, generate an appropriate error message, * falling back on strerror() for an unexpected return code. */ const char *errstring = NULL; const char *msg = gettext("Channel program execution failed"); uint64_t instructions = 0; if (outnvl != NULL && nvlist_exists(outnvl, ZCP_RET_ERROR)) { char *es = NULL; (void) nvlist_lookup_string(outnvl, ZCP_RET_ERROR, &es); if (es == NULL) errstring = strerror(ret); else errstring = es; if (ret == ETIME) { (void) nvlist_lookup_uint64(outnvl, ZCP_ARG_INSTRLIMIT, &instructions); } } else { switch (ret) { case EINVAL: errstring = "Invalid instruction or memory limit."; break; case ENOMEM: errstring = "Return value too large."; break; case ENOSPC: errstring = "Memory limit exhausted."; break; case ETIME: errstring = "Timed out."; break; case EPERM: errstring = "Permission denied. Channel " "programs must be run as root."; break; default: (void) zfs_standard_error(g_zfs, ret, msg); } } if (errstring != NULL) (void) fprintf(stderr, "%s:\n%s\n", msg, errstring); if (ret == ETIME && instructions != 0) (void) fprintf(stderr, gettext("%llu Lua instructions\n"), (u_longlong_t)instructions); } else { if (json_output) { (void) nvlist_print_json(stdout, outnvl); } else if (nvlist_empty(outnvl)) { (void) fprintf(stdout, gettext("Channel program fully " "executed and did not produce output.\n")); } else { (void) fprintf(stdout, gettext("Channel program fully " "executed and produced output:\n")); dump_nvlist(outnvl, 4); } } free(progbuf); fnvlist_free(outnvl); fnvlist_free(argnvl); return (ret != 0); usage: usage(B_FALSE); return (-1); } typedef struct loadkey_cbdata { boolean_t cb_loadkey; boolean_t cb_recursive; boolean_t cb_noop; char *cb_keylocation; uint64_t cb_numfailed; uint64_t cb_numattempted; } loadkey_cbdata_t; static int load_key_callback(zfs_handle_t *zhp, void *data) { int ret; boolean_t is_encroot; loadkey_cbdata_t *cb = data; uint64_t keystatus = zfs_prop_get_int(zhp, ZFS_PROP_KEYSTATUS); /* * If we are working recursively, we want to skip loading / unloading * keys for non-encryption roots and datasets whose keys are already * in the desired end-state. */ if (cb->cb_recursive) { ret = zfs_crypto_get_encryption_root(zhp, &is_encroot, NULL); if (ret != 0) return (ret); if (!is_encroot) return (0); if ((cb->cb_loadkey && keystatus == ZFS_KEYSTATUS_AVAILABLE) || (!cb->cb_loadkey && keystatus == ZFS_KEYSTATUS_UNAVAILABLE)) return (0); } cb->cb_numattempted++; if (cb->cb_loadkey) ret = zfs_crypto_load_key(zhp, cb->cb_noop, cb->cb_keylocation); else ret = zfs_crypto_unload_key(zhp); if (ret != 0) { cb->cb_numfailed++; return (ret); } return (0); } static int load_unload_keys(int argc, char **argv, boolean_t loadkey) { int c, ret = 0, flags = 0; boolean_t do_all = B_FALSE; loadkey_cbdata_t cb = { 0 }; cb.cb_loadkey = loadkey; while ((c = getopt(argc, argv, "anrL:")) != -1) { /* noop and alternate keylocations only apply to zfs load-key */ if (loadkey) { switch (c) { case 'n': cb.cb_noop = B_TRUE; continue; case 'L': cb.cb_keylocation = optarg; continue; default: break; } } switch (c) { case 'a': do_all = B_TRUE; cb.cb_recursive = B_TRUE; break; case 'r': flags |= ZFS_ITER_RECURSE; cb.cb_recursive = B_TRUE; break; default: (void) fprintf(stderr, gettext("invalid option '%c'\n"), optopt); usage(B_FALSE); } } argc -= optind; argv += optind; if (!do_all && argc == 0) { (void) fprintf(stderr, gettext("Missing dataset argument or -a option\n")); usage(B_FALSE); } if (do_all && argc != 0) { (void) fprintf(stderr, gettext("Cannot specify dataset with -a option\n")); usage(B_FALSE); } if (cb.cb_recursive && cb.cb_keylocation != NULL && strcmp(cb.cb_keylocation, "prompt") != 0) { (void) fprintf(stderr, gettext("alternate keylocation may only " "be 'prompt' with -r or -a\n")); usage(B_FALSE); } ret = zfs_for_each(argc, argv, flags, ZFS_TYPE_FILESYSTEM | ZFS_TYPE_VOLUME, NULL, NULL, 0, load_key_callback, &cb); if (cb.cb_noop || (cb.cb_recursive && cb.cb_numattempted != 0)) { (void) printf(gettext("%llu / %llu key(s) successfully %s\n"), (u_longlong_t)(cb.cb_numattempted - cb.cb_numfailed), (u_longlong_t)cb.cb_numattempted, loadkey ? (cb.cb_noop ? "verified" : "loaded") : "unloaded"); } if (cb.cb_numfailed != 0) ret = -1; return (ret); } static int zfs_do_load_key(int argc, char **argv) { return (load_unload_keys(argc, argv, B_TRUE)); } static int zfs_do_unload_key(int argc, char **argv) { return (load_unload_keys(argc, argv, B_FALSE)); } static int zfs_do_change_key(int argc, char **argv) { int c, ret; uint64_t keystatus; boolean_t loadkey = B_FALSE, inheritkey = B_FALSE; zfs_handle_t *zhp = NULL; nvlist_t *props = fnvlist_alloc(); while ((c = getopt(argc, argv, "lio:")) != -1) { switch (c) { case 'l': loadkey = B_TRUE; break; case 'i': inheritkey = B_TRUE; break; case 'o': if (!parseprop(props, optarg)) { nvlist_free(props); return (1); } break; default: (void) fprintf(stderr, gettext("invalid option '%c'\n"), optopt); usage(B_FALSE); } } if (inheritkey && !nvlist_empty(props)) { (void) fprintf(stderr, gettext("Properties not allowed for inheriting\n")); usage(B_FALSE); } argc -= optind; argv += optind; if (argc < 1) { (void) fprintf(stderr, gettext("Missing dataset argument\n")); usage(B_FALSE); } if (argc > 1) { (void) fprintf(stderr, gettext("Too many arguments\n")); usage(B_FALSE); } zhp = zfs_open(g_zfs, argv[argc - 1], ZFS_TYPE_FILESYSTEM | ZFS_TYPE_VOLUME); if (zhp == NULL) usage(B_FALSE); if (loadkey) { keystatus = zfs_prop_get_int(zhp, ZFS_PROP_KEYSTATUS); if (keystatus != ZFS_KEYSTATUS_AVAILABLE) { ret = zfs_crypto_load_key(zhp, B_FALSE, NULL); if (ret != 0) { nvlist_free(props); zfs_close(zhp); return (-1); } } /* refresh the properties so the new keystatus is visible */ zfs_refresh_properties(zhp); } ret = zfs_crypto_rewrap(zhp, props, inheritkey); if (ret != 0) { nvlist_free(props); zfs_close(zhp); return (-1); } nvlist_free(props); zfs_close(zhp); return (0); } /* * 1) zfs project [-d|-r] * List project ID and inherit flag of file(s) or directories. * -d: List the directory itself, not its children. * -r: List subdirectories recursively. * * 2) zfs project -C [-k] [-r] * Clear project inherit flag and/or ID on the file(s) or directories. * -k: Keep the project ID unchanged. If not specified, the project ID * will be reset as zero. * -r: Clear on subdirectories recursively. * * 3) zfs project -c [-0] [-d|-r] [-p id] * Check project ID and inherit flag on the file(s) or directories, * report the outliers. * -0: Print file name followed by a NUL instead of newline. * -d: Check the directory itself, not its children. * -p: Specify the referenced ID for comparing with the target file(s) * or directories' project IDs. If not specified, the target (top) * directory's project ID will be used as the referenced one. * -r: Check subdirectories recursively. * * 4) zfs project [-p id] [-r] [-s] * Set project ID and/or inherit flag on the file(s) or directories. * -p: Set the project ID as the given id. * -r: Set on subdirectories recursively. If not specify "-p" option, * it will use top-level directory's project ID as the given id, * then set both project ID and inherit flag on all descendants * of the top-level directory. * -s: Set project inherit flag. */ static int zfs_do_project(int argc, char **argv) { zfs_project_control_t zpc = { .zpc_expected_projid = ZFS_INVALID_PROJID, .zpc_op = ZFS_PROJECT_OP_DEFAULT, .zpc_dironly = B_FALSE, .zpc_keep_projid = B_FALSE, .zpc_newline = B_TRUE, .zpc_recursive = B_FALSE, .zpc_set_flag = B_FALSE, }; int ret = 0, c; if (argc < 2) usage(B_FALSE); while ((c = getopt(argc, argv, "0Ccdkp:rs")) != -1) { switch (c) { case '0': zpc.zpc_newline = B_FALSE; break; case 'C': if (zpc.zpc_op != ZFS_PROJECT_OP_DEFAULT) { (void) fprintf(stderr, gettext("cannot " "specify '-C' '-c' '-s' together\n")); usage(B_FALSE); } zpc.zpc_op = ZFS_PROJECT_OP_CLEAR; break; case 'c': if (zpc.zpc_op != ZFS_PROJECT_OP_DEFAULT) { (void) fprintf(stderr, gettext("cannot " "specify '-C' '-c' '-s' together\n")); usage(B_FALSE); } zpc.zpc_op = ZFS_PROJECT_OP_CHECK; break; case 'd': zpc.zpc_dironly = B_TRUE; /* overwrite "-r" option */ zpc.zpc_recursive = B_FALSE; break; case 'k': zpc.zpc_keep_projid = B_TRUE; break; case 'p': { char *endptr; errno = 0; zpc.zpc_expected_projid = strtoull(optarg, &endptr, 0); if (errno != 0 || *endptr != '\0') { (void) fprintf(stderr, gettext("project ID must be less than " "%u\n"), UINT32_MAX); usage(B_FALSE); } if (zpc.zpc_expected_projid >= UINT32_MAX) { (void) fprintf(stderr, gettext("invalid project ID\n")); usage(B_FALSE); } break; } case 'r': zpc.zpc_recursive = B_TRUE; /* overwrite "-d" option */ zpc.zpc_dironly = B_FALSE; break; case 's': if (zpc.zpc_op != ZFS_PROJECT_OP_DEFAULT) { (void) fprintf(stderr, gettext("cannot " "specify '-C' '-c' '-s' together\n")); usage(B_FALSE); } zpc.zpc_set_flag = B_TRUE; zpc.zpc_op = ZFS_PROJECT_OP_SET; break; default: (void) fprintf(stderr, gettext("invalid option '%c'\n"), optopt); usage(B_FALSE); } } if (zpc.zpc_op == ZFS_PROJECT_OP_DEFAULT) { if (zpc.zpc_expected_projid != ZFS_INVALID_PROJID) zpc.zpc_op = ZFS_PROJECT_OP_SET; else zpc.zpc_op = ZFS_PROJECT_OP_LIST; } switch (zpc.zpc_op) { case ZFS_PROJECT_OP_LIST: if (zpc.zpc_keep_projid) { (void) fprintf(stderr, gettext("'-k' is only valid together with '-C'\n")); usage(B_FALSE); } if (!zpc.zpc_newline) { (void) fprintf(stderr, gettext("'-0' is only valid together with '-c'\n")); usage(B_FALSE); } break; case ZFS_PROJECT_OP_CHECK: if (zpc.zpc_keep_projid) { (void) fprintf(stderr, gettext("'-k' is only valid together with '-C'\n")); usage(B_FALSE); } break; case ZFS_PROJECT_OP_CLEAR: if (zpc.zpc_dironly) { (void) fprintf(stderr, gettext("'-d' is useless together with '-C'\n")); usage(B_FALSE); } if (!zpc.zpc_newline) { (void) fprintf(stderr, gettext("'-0' is only valid together with '-c'\n")); usage(B_FALSE); } if (zpc.zpc_expected_projid != ZFS_INVALID_PROJID) { (void) fprintf(stderr, gettext("'-p' is useless together with '-C'\n")); usage(B_FALSE); } break; case ZFS_PROJECT_OP_SET: if (zpc.zpc_dironly) { (void) fprintf(stderr, gettext("'-d' is useless for set project ID and/or " "inherit flag\n")); usage(B_FALSE); } if (zpc.zpc_keep_projid) { (void) fprintf(stderr, gettext("'-k' is only valid together with '-C'\n")); usage(B_FALSE); } if (!zpc.zpc_newline) { (void) fprintf(stderr, gettext("'-0' is only valid together with '-c'\n")); usage(B_FALSE); } break; default: ASSERT(0); break; } argv += optind; argc -= optind; if (argc == 0) { (void) fprintf(stderr, gettext("missing file or directory target(s)\n")); usage(B_FALSE); } for (int i = 0; i < argc; i++) { int err; err = zfs_project_handle(argv[i], &zpc); if (err && !ret) ret = err; } return (ret); } static int zfs_do_wait(int argc, char **argv) { boolean_t enabled[ZFS_WAIT_NUM_ACTIVITIES]; int error, i; int c; /* By default, wait for all types of activity. */ for (i = 0; i < ZFS_WAIT_NUM_ACTIVITIES; i++) enabled[i] = B_TRUE; while ((c = getopt(argc, argv, "t:")) != -1) { switch (c) { case 't': /* Reset activities array */ memset(&enabled, 0, sizeof (enabled)); for (char *tok; (tok = strsep(&optarg, ",")); ) { static const char *const col_subopts[ ZFS_WAIT_NUM_ACTIVITIES] = { "deleteq" }; for (i = 0; i < ARRAY_SIZE(col_subopts); ++i) if (strcmp(tok, col_subopts[i]) == 0) { enabled[i] = B_TRUE; goto found; } (void) fprintf(stderr, gettext("invalid activity '%s'\n"), tok); usage(B_FALSE); found:; } break; case '?': (void) fprintf(stderr, gettext("invalid option '%c'\n"), optopt); usage(B_FALSE); } } argv += optind; argc -= optind; if (argc < 1) { (void) fprintf(stderr, gettext("missing 'filesystem' " "argument\n")); usage(B_FALSE); } if (argc > 1) { (void) fprintf(stderr, gettext("too many arguments\n")); usage(B_FALSE); } zfs_handle_t *zhp = zfs_open(g_zfs, argv[0], ZFS_TYPE_FILESYSTEM); if (zhp == NULL) return (1); for (;;) { boolean_t missing = B_FALSE; boolean_t any_waited = B_FALSE; for (int i = 0; i < ZFS_WAIT_NUM_ACTIVITIES; i++) { boolean_t waited; if (!enabled[i]) continue; error = zfs_wait_status(zhp, i, &missing, &waited); if (error != 0 || missing) break; any_waited = (any_waited || waited); } if (error != 0 || missing || !any_waited) break; } zfs_close(zhp); return (error); } /* * Display version message */ static int zfs_do_version(int argc, char **argv) { (void) argc, (void) argv; return (zfs_version_print() != 0); } int main(int argc, char **argv) { int ret = 0; int i = 0; const char *cmdname; char **newargv; (void) setlocale(LC_ALL, ""); (void) setlocale(LC_NUMERIC, "C"); (void) textdomain(TEXT_DOMAIN); opterr = 0; /* * Make sure the user has specified some command. */ if (argc < 2) { (void) fprintf(stderr, gettext("missing command\n")); usage(B_FALSE); } cmdname = argv[1]; /* * The 'umount' command is an alias for 'unmount' */ if (strcmp(cmdname, "umount") == 0) cmdname = "unmount"; /* * The 'recv' command is an alias for 'receive' */ if (strcmp(cmdname, "recv") == 0) cmdname = "receive"; /* * The 'snap' command is an alias for 'snapshot' */ if (strcmp(cmdname, "snap") == 0) cmdname = "snapshot"; /* * Special case '-?' */ if ((strcmp(cmdname, "-?") == 0) || (strcmp(cmdname, "--help") == 0)) usage(B_TRUE); /* * Special case '-V|--version' */ if ((strcmp(cmdname, "-V") == 0) || (strcmp(cmdname, "--version") == 0)) return (zfs_do_version(argc, argv)); if ((g_zfs = libzfs_init()) == NULL) { (void) fprintf(stderr, "%s\n", libzfs_error_init(errno)); return (1); } zfs_save_arguments(argc, argv, history_str, sizeof (history_str)); libzfs_print_on_error(g_zfs, B_TRUE); /* * Many commands modify input strings for string parsing reasons. * We create a copy to protect the original argv. */ newargv = safe_malloc((argc + 1) * sizeof (newargv[0])); for (i = 0; i < argc; i++) newargv[i] = strdup(argv[i]); newargv[argc] = NULL; /* * Run the appropriate command. */ libzfs_mnttab_cache(g_zfs, B_TRUE); if (find_command_idx(cmdname, &i) == 0) { current_command = &command_table[i]; ret = command_table[i].func(argc - 1, newargv + 1); } else if (strchr(cmdname, '=') != NULL) { verify(find_command_idx("set", &i) == 0); current_command = &command_table[i]; ret = command_table[i].func(argc, newargv); } else { (void) fprintf(stderr, gettext("unrecognized " "command '%s'\n"), cmdname); usage(B_FALSE); ret = 1; } for (i = 0; i < argc; i++) free(newargv[i]); free(newargv); if (ret == 0 && log_history) (void) zpool_log_history(g_zfs, history_str); libzfs_fini(g_zfs); /* * The 'ZFS_ABORT' environment variable causes us to dump core on exit * for the purposes of running ::findleaks. */ if (getenv("ZFS_ABORT") != NULL) { (void) printf("dumping core by request\n"); abort(); } return (ret); } /* * zfs zone nsfile filesystem * * Add or delete the given dataset to/from the namespace. */ #ifdef __linux__ static int zfs_do_zone_impl(int argc, char **argv, boolean_t attach) { zfs_handle_t *zhp; int ret; if (argc < 3) { (void) fprintf(stderr, gettext("missing argument(s)\n")); usage(B_FALSE); } if (argc > 3) { (void) fprintf(stderr, gettext("too many arguments\n")); usage(B_FALSE); } zhp = zfs_open(g_zfs, argv[2], ZFS_TYPE_FILESYSTEM); if (zhp == NULL) return (1); ret = (zfs_userns(zhp, argv[1], attach) != 0); zfs_close(zhp); return (ret); } static int zfs_do_zone(int argc, char **argv) { return (zfs_do_zone_impl(argc, argv, B_TRUE)); } static int zfs_do_unzone(int argc, char **argv) { return (zfs_do_zone_impl(argc, argv, B_FALSE)); } #endif #ifdef __FreeBSD__ #include #include /* * Attach/detach the given dataset to/from the given jail */ static int zfs_do_jail_impl(int argc, char **argv, boolean_t attach) { zfs_handle_t *zhp; int jailid, ret; /* check number of arguments */ if (argc < 3) { (void) fprintf(stderr, gettext("missing argument(s)\n")); usage(B_FALSE); } if (argc > 3) { (void) fprintf(stderr, gettext("too many arguments\n")); usage(B_FALSE); } jailid = jail_getid(argv[1]); if (jailid < 0) { (void) fprintf(stderr, gettext("invalid jail id or name\n")); usage(B_FALSE); } zhp = zfs_open(g_zfs, argv[2], ZFS_TYPE_FILESYSTEM); if (zhp == NULL) return (1); ret = (zfs_jail(zhp, jailid, attach) != 0); zfs_close(zhp); return (ret); } /* * zfs jail jailid filesystem * * Attach the given dataset to the given jail */ static int zfs_do_jail(int argc, char **argv) { return (zfs_do_jail_impl(argc, argv, B_TRUE)); } /* * zfs unjail jailid filesystem * * Detach the given dataset from the given jail */ static int zfs_do_unjail(int argc, char **argv) { return (zfs_do_jail_impl(argc, argv, B_FALSE)); } #endif diff --git a/lib/libefi/rdwr_efi.c b/lib/libefi/rdwr_efi.c index f159a022496c..3501c3ea391c 100644 --- a/lib/libefi/rdwr_efi.c +++ b/lib/libefi/rdwr_efi.c @@ -1,1629 +1,1627 @@ /* * CDDL HEADER START * * The contents of this file are subject to the terms of the * Common Development and Distribution License (the "License"). * You may not use this file except in compliance with the License. * * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE * or https://opensource.org/licenses/CDDL-1.0. * See the License for the specific language governing permissions * and limitations under the License. * * When distributing Covered Code, include this CDDL HEADER in each * file and include the License file at usr/src/OPENSOLARIS.LICENSE. * If applicable, add the following below this CDDL HEADER, with the * fields enclosed by brackets "[]" replaced with your own identifying * information: Portions Copyright [yyyy] [name of copyright owner] * * CDDL HEADER END */ /* * Copyright (c) 2002, 2010, Oracle and/or its affiliates. All rights reserved. * Copyright 2012 Nexenta Systems, Inc. All rights reserved. * Copyright (c) 2018 by Delphix. All rights reserved. */ #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include static struct uuid_to_ptag { struct uuid uuid; } conversion_array[] = { { EFI_UNUSED }, { EFI_BOOT }, { EFI_ROOT }, { EFI_SWAP }, { EFI_USR }, { EFI_BACKUP }, { EFI_UNUSED }, /* STAND is never used */ { EFI_VAR }, { EFI_HOME }, { EFI_ALTSCTR }, { EFI_UNUSED }, /* CACHE (cachefs) is never used */ { EFI_RESERVED }, { EFI_SYSTEM }, { EFI_LEGACY_MBR }, { EFI_SYMC_PUB }, { EFI_SYMC_CDS }, { EFI_MSFT_RESV }, { EFI_DELL_BASIC }, { EFI_DELL_RAID }, { EFI_DELL_SWAP }, { EFI_DELL_LVM }, { EFI_DELL_RESV }, { EFI_AAPL_HFS }, { EFI_AAPL_UFS }, { EFI_FREEBSD_BOOT }, { EFI_FREEBSD_SWAP }, { EFI_FREEBSD_UFS }, { EFI_FREEBSD_VINUM }, { EFI_FREEBSD_ZFS }, { EFI_BIOS_BOOT }, { EFI_INTC_RS }, { EFI_SNE_BOOT }, { EFI_LENOVO_BOOT }, { EFI_MSFT_LDMM }, { EFI_MSFT_LDMD }, { EFI_MSFT_RE }, { EFI_IBM_GPFS }, { EFI_MSFT_STORAGESPACES }, { EFI_HPQ_DATA }, { EFI_HPQ_SVC }, { EFI_RHT_DATA }, { EFI_RHT_HOME }, { EFI_RHT_SRV }, { EFI_RHT_DMCRYPT }, { EFI_RHT_LUKS }, { EFI_FREEBSD_DISKLABEL }, { EFI_AAPL_RAID }, { EFI_AAPL_RAIDOFFLINE }, { EFI_AAPL_BOOT }, { EFI_AAPL_LABEL }, { EFI_AAPL_TVRECOVERY }, { EFI_AAPL_CORESTORAGE }, { EFI_NETBSD_SWAP }, { EFI_NETBSD_FFS }, { EFI_NETBSD_LFS }, { EFI_NETBSD_RAID }, { EFI_NETBSD_CAT }, { EFI_NETBSD_CRYPT }, { EFI_GOOG_KERN }, { EFI_GOOG_ROOT }, { EFI_GOOG_RESV }, { EFI_HAIKU_BFS }, { EFI_MIDNIGHTBSD_BOOT }, { EFI_MIDNIGHTBSD_DATA }, { EFI_MIDNIGHTBSD_SWAP }, { EFI_MIDNIGHTBSD_UFS }, { EFI_MIDNIGHTBSD_VINUM }, { EFI_MIDNIGHTBSD_ZFS }, { EFI_CEPH_JOURNAL }, { EFI_CEPH_DMCRYPTJOURNAL }, { EFI_CEPH_OSD }, { EFI_CEPH_DMCRYPTOSD }, { EFI_CEPH_CREATE }, { EFI_CEPH_DMCRYPTCREATE }, { EFI_OPENBSD_DISKLABEL }, { EFI_BBRY_QNX }, { EFI_BELL_PLAN9 }, { EFI_VMW_KCORE }, { EFI_VMW_VMFS }, { EFI_VMW_RESV }, { EFI_RHT_ROOTX86 }, { EFI_RHT_ROOTAMD64 }, { EFI_RHT_ROOTARM }, { EFI_RHT_ROOTARM64 }, { EFI_ACRONIS_SECUREZONE }, { EFI_ONIE_BOOT }, { EFI_ONIE_CONFIG }, { EFI_IBM_PPRPBOOT }, { EFI_FREEDESKTOP_BOOT } }; int efi_debug = 0; static int efi_read(int, struct dk_gpt *); /* * Return a 32-bit CRC of the contents of the buffer. Pre-and-post * one's conditioning will be handled by crc32() internally. */ static uint32_t efi_crc32(const unsigned char *buf, unsigned int size) { uint32_t crc = crc32(0, Z_NULL, 0); crc = crc32(crc, buf, size); return (crc); } static int read_disk_info(int fd, diskaddr_t *capacity, uint_t *lbsize) { int sector_size; unsigned long long capacity_size; if (ioctl(fd, BLKSSZGET, §or_size) < 0) return (-1); if (ioctl(fd, BLKGETSIZE64, &capacity_size) < 0) return (-1); *lbsize = (uint_t)sector_size; *capacity = (diskaddr_t)(capacity_size / sector_size); return (0); } /* * Return back the device name associated with the file descriptor. The * caller is responsible for freeing the memory associated with the * returned string. */ static char * efi_get_devname(int fd) { char path[32]; /* * The libefi API only provides the open fd and not the file path. * To handle this realpath(3) is used to resolve the block device * name from /proc/self/fd/. */ (void) snprintf(path, sizeof (path), "/proc/self/fd/%d", fd); return (realpath(path, NULL)); } static int efi_get_info(int fd, struct dk_cinfo *dki_info) { char *dev_path; int rval = 0; memset(dki_info, 0, sizeof (*dki_info)); /* * The simplest way to get the partition number under linux is * to parse it out of the /dev/ block device name. * The kernel creates this using the partition number when it * populates /dev/ so it may be trusted. The tricky bit here is * that the naming convention is based on the block device type. * So we need to take this in to account when parsing out the * partition information. Aside from the partition number we collect * some additional device info. */ dev_path = efi_get_devname(fd); if (dev_path == NULL) goto error; if ((strncmp(dev_path, "/dev/sd", 7) == 0)) { strcpy(dki_info->dki_cname, "sd"); dki_info->dki_ctype = DKC_SCSI_CCS; rval = sscanf(dev_path, "/dev/%[a-zA-Z]%hu", dki_info->dki_dname, &dki_info->dki_partition); } else if ((strncmp(dev_path, "/dev/hd", 7) == 0)) { strcpy(dki_info->dki_cname, "hd"); dki_info->dki_ctype = DKC_DIRECT; rval = sscanf(dev_path, "/dev/%[a-zA-Z]%hu", dki_info->dki_dname, &dki_info->dki_partition); } else if ((strncmp(dev_path, "/dev/md", 7) == 0)) { strcpy(dki_info->dki_cname, "pseudo"); dki_info->dki_ctype = DKC_MD; strcpy(dki_info->dki_dname, "md"); rval = sscanf(dev_path, "/dev/md%[0-9]p%hu", dki_info->dki_dname + 2, &dki_info->dki_partition); } else if ((strncmp(dev_path, "/dev/vd", 7) == 0)) { strcpy(dki_info->dki_cname, "vd"); dki_info->dki_ctype = DKC_MD; rval = sscanf(dev_path, "/dev/%[a-zA-Z]%hu", dki_info->dki_dname, &dki_info->dki_partition); } else if ((strncmp(dev_path, "/dev/xvd", 8) == 0)) { strcpy(dki_info->dki_cname, "xvd"); dki_info->dki_ctype = DKC_MD; rval = sscanf(dev_path, "/dev/%[a-zA-Z]%hu", dki_info->dki_dname, &dki_info->dki_partition); } else if ((strncmp(dev_path, "/dev/zd", 7) == 0)) { strcpy(dki_info->dki_cname, "zd"); dki_info->dki_ctype = DKC_MD; strcpy(dki_info->dki_dname, "zd"); rval = sscanf(dev_path, "/dev/zd%[0-9]p%hu", dki_info->dki_dname + 2, &dki_info->dki_partition); } else if ((strncmp(dev_path, "/dev/dm-", 8) == 0)) { strcpy(dki_info->dki_cname, "pseudo"); dki_info->dki_ctype = DKC_VBD; strcpy(dki_info->dki_dname, "dm-"); rval = sscanf(dev_path, "/dev/dm-%[0-9]p%hu", dki_info->dki_dname + 3, &dki_info->dki_partition); } else if ((strncmp(dev_path, "/dev/ram", 8) == 0)) { strcpy(dki_info->dki_cname, "pseudo"); dki_info->dki_ctype = DKC_PCMCIA_MEM; strcpy(dki_info->dki_dname, "ram"); rval = sscanf(dev_path, "/dev/ram%[0-9]p%hu", dki_info->dki_dname + 3, &dki_info->dki_partition); } else if ((strncmp(dev_path, "/dev/loop", 9) == 0)) { strcpy(dki_info->dki_cname, "pseudo"); dki_info->dki_ctype = DKC_VBD; strcpy(dki_info->dki_dname, "loop"); rval = sscanf(dev_path, "/dev/loop%[0-9]p%hu", dki_info->dki_dname + 4, &dki_info->dki_partition); } else if ((strncmp(dev_path, "/dev/nvme", 9) == 0)) { strcpy(dki_info->dki_cname, "nvme"); dki_info->dki_ctype = DKC_SCSI_CCS; strcpy(dki_info->dki_dname, "nvme"); (void) sscanf(dev_path, "/dev/nvme%[0-9]", dki_info->dki_dname + 4); size_t controller_length = strlen( dki_info->dki_dname); strcpy(dki_info->dki_dname + controller_length, "n"); rval = sscanf(dev_path, "/dev/nvme%*[0-9]n%[0-9]p%hu", dki_info->dki_dname + controller_length + 1, &dki_info->dki_partition); } else { strcpy(dki_info->dki_dname, "unknown"); strcpy(dki_info->dki_cname, "unknown"); dki_info->dki_ctype = DKC_UNKNOWN; } switch (rval) { case 0: errno = EINVAL; goto error; case 1: dki_info->dki_partition = 0; } free(dev_path); return (0); error: if (efi_debug) (void) fprintf(stderr, "DKIOCINFO errno 0x%x\n", errno); switch (errno) { case EIO: return (VT_EIO); case EINVAL: return (VT_EINVAL); default: return (VT_ERROR); } } /* * the number of blocks the EFI label takes up (round up to nearest * block) */ #define NBLOCKS(p, l) (1 + ((((p) * (int)sizeof (efi_gpe_t)) + \ ((l) - 1)) / (l))) /* number of partitions -- limited by what we can malloc */ #define MAX_PARTS ((4294967295UL - sizeof (struct dk_gpt)) / \ sizeof (struct dk_part)) int efi_alloc_and_init(int fd, uint32_t nparts, struct dk_gpt **vtoc) { diskaddr_t capacity = 0; uint_t lbsize = 0; uint_t nblocks; size_t length; struct dk_gpt *vptr; struct uuid uuid; struct dk_cinfo dki_info; if (read_disk_info(fd, &capacity, &lbsize) != 0) return (-1); if (efi_get_info(fd, &dki_info) != 0) return (-1); if (dki_info.dki_partition != 0) return (-1); if ((dki_info.dki_ctype == DKC_PCMCIA_MEM) || (dki_info.dki_ctype == DKC_VBD) || (dki_info.dki_ctype == DKC_UNKNOWN)) return (-1); nblocks = NBLOCKS(nparts, lbsize); if ((nblocks * lbsize) < EFI_MIN_ARRAY_SIZE + lbsize) { /* 16K plus one block for the GPT */ nblocks = EFI_MIN_ARRAY_SIZE / lbsize + 1; } if (nparts > MAX_PARTS) { if (efi_debug) { (void) fprintf(stderr, "the maximum number of partitions supported is %lu\n", MAX_PARTS); } return (-1); } length = sizeof (struct dk_gpt) + sizeof (struct dk_part) * (nparts - 1); vptr = calloc(1, length); if (vptr == NULL) return (-1); *vtoc = vptr; vptr->efi_version = EFI_VERSION_CURRENT; vptr->efi_lbasize = lbsize; vptr->efi_nparts = nparts; /* * add one block here for the PMBR; on disks with a 512 byte * block size and 128 or fewer partitions, efi_first_u_lba * should work out to "34" */ vptr->efi_first_u_lba = nblocks + 1; vptr->efi_last_lba = capacity - 1; vptr->efi_altern_lba = capacity -1; vptr->efi_last_u_lba = vptr->efi_last_lba - nblocks; (void) uuid_generate((uchar_t *)&uuid); UUID_LE_CONVERT(vptr->efi_disk_uguid, uuid); return (0); } /* * Read EFI - return partition number upon success. */ int efi_alloc_and_read(int fd, struct dk_gpt **vtoc) { int rval; uint32_t nparts; int length; struct dk_gpt *vptr; /* figure out the number of entries that would fit into 16K */ nparts = EFI_MIN_ARRAY_SIZE / sizeof (efi_gpe_t); length = (int) sizeof (struct dk_gpt) + (int) sizeof (struct dk_part) * (nparts - 1); vptr = calloc(1, length); if (vptr == NULL) return (VT_ERROR); vptr->efi_nparts = nparts; rval = efi_read(fd, vptr); if ((rval == VT_EINVAL) && vptr->efi_nparts > nparts) { void *tmp; length = (int) sizeof (struct dk_gpt) + (int) sizeof (struct dk_part) * (vptr->efi_nparts - 1); - nparts = vptr->efi_nparts; if ((tmp = realloc(vptr, length)) == NULL) { /* cppcheck-suppress doubleFree */ free(vptr); *vtoc = NULL; return (VT_ERROR); } else { vptr = tmp; rval = efi_read(fd, vptr); } } if (rval < 0) { if (efi_debug) { (void) fprintf(stderr, "read of EFI table failed, rval=%d\n", rval); } free(vptr); *vtoc = NULL; } else { *vtoc = vptr; } return (rval); } static int efi_ioctl(int fd, int cmd, dk_efi_t *dk_ioc) { void *data = dk_ioc->dki_data; int error; diskaddr_t capacity; uint_t lbsize; /* * When the IO is not being performed in kernel as an ioctl we need * to know the sector size so we can seek to the proper byte offset. */ if (read_disk_info(fd, &capacity, &lbsize) == -1) { if (efi_debug) fprintf(stderr, "unable to read disk info: %d", errno); errno = EIO; return (-1); } switch (cmd) { case DKIOCGETEFI: if (lbsize == 0) { if (efi_debug) (void) fprintf(stderr, "DKIOCGETEFI assuming " "LBA %d bytes\n", DEV_BSIZE); lbsize = DEV_BSIZE; } error = lseek(fd, dk_ioc->dki_lba * lbsize, SEEK_SET); if (error == -1) { if (efi_debug) (void) fprintf(stderr, "DKIOCGETEFI lseek " "error: %d\n", errno); return (error); } error = read(fd, data, dk_ioc->dki_length); if (error == -1) { if (efi_debug) (void) fprintf(stderr, "DKIOCGETEFI read " "error: %d\n", errno); return (error); } if (error != dk_ioc->dki_length) { if (efi_debug) (void) fprintf(stderr, "DKIOCGETEFI short " "read of %d bytes\n", error); errno = EIO; return (-1); } error = 0; break; case DKIOCSETEFI: if (lbsize == 0) { if (efi_debug) (void) fprintf(stderr, "DKIOCSETEFI unknown " "LBA size\n"); errno = EIO; return (-1); } error = lseek(fd, dk_ioc->dki_lba * lbsize, SEEK_SET); if (error == -1) { if (efi_debug) (void) fprintf(stderr, "DKIOCSETEFI lseek " "error: %d\n", errno); return (error); } error = write(fd, data, dk_ioc->dki_length); if (error == -1) { if (efi_debug) (void) fprintf(stderr, "DKIOCSETEFI write " "error: %d\n", errno); return (error); } if (error != dk_ioc->dki_length) { if (efi_debug) (void) fprintf(stderr, "DKIOCSETEFI short " "write of %d bytes\n", error); errno = EIO; return (-1); } /* Sync the new EFI table to disk */ error = fsync(fd); if (error == -1) return (error); /* Ensure any local disk cache is also flushed */ if (ioctl(fd, BLKFLSBUF, 0) == -1) return (error); error = 0; break; default: if (efi_debug) (void) fprintf(stderr, "unsupported ioctl()\n"); errno = EIO; return (-1); } return (error); } int efi_rescan(int fd) { int retry = 10; - int error; /* Notify the kernel a devices partition table has been updated */ - while ((error = ioctl(fd, BLKRRPART)) != 0) { + while (ioctl(fd, BLKRRPART) != 0) { if ((--retry == 0) || (errno != EBUSY)) { (void) fprintf(stderr, "the kernel failed to rescan " "the partition table: %d\n", errno); return (-1); } usleep(50000); } return (0); } static int check_label(int fd, dk_efi_t *dk_ioc) { efi_gpt_t *efi; uint_t crc; if (efi_ioctl(fd, DKIOCGETEFI, dk_ioc) == -1) { switch (errno) { case EIO: return (VT_EIO); default: return (VT_ERROR); } } efi = dk_ioc->dki_data; if (efi->efi_gpt_Signature != LE_64(EFI_SIGNATURE)) { if (efi_debug) (void) fprintf(stderr, "Bad EFI signature: 0x%llx != 0x%llx\n", (long long)efi->efi_gpt_Signature, (long long)LE_64(EFI_SIGNATURE)); return (VT_EINVAL); } /* * check CRC of the header; the size of the header should * never be larger than one block */ crc = efi->efi_gpt_HeaderCRC32; efi->efi_gpt_HeaderCRC32 = 0; len_t headerSize = (len_t)LE_32(efi->efi_gpt_HeaderSize); if (headerSize < EFI_MIN_LABEL_SIZE || headerSize > EFI_LABEL_SIZE) { if (efi_debug) (void) fprintf(stderr, "Invalid EFI HeaderSize %llu. Assuming %d.\n", headerSize, EFI_MIN_LABEL_SIZE); } if ((headerSize > dk_ioc->dki_length) || crc != LE_32(efi_crc32((unsigned char *)efi, headerSize))) { if (efi_debug) (void) fprintf(stderr, "Bad EFI CRC: 0x%x != 0x%x\n", crc, LE_32(efi_crc32((unsigned char *)efi, headerSize))); return (VT_EINVAL); } return (0); } static int efi_read(int fd, struct dk_gpt *vtoc) { int i, j; int label_len; int rval = 0; int md_flag = 0; int vdc_flag = 0; diskaddr_t capacity = 0; uint_t lbsize = 0; struct dk_minfo disk_info; dk_efi_t dk_ioc; efi_gpt_t *efi; efi_gpe_t *efi_parts; struct dk_cinfo dki_info; uint32_t user_length; boolean_t legacy_label = B_FALSE; /* * get the partition number for this file descriptor. */ if ((rval = efi_get_info(fd, &dki_info)) != 0) return (rval); if ((strncmp(dki_info.dki_cname, "pseudo", 7) == 0) && (strncmp(dki_info.dki_dname, "md", 3) == 0)) { md_flag++; } else if ((strncmp(dki_info.dki_cname, "vdc", 4) == 0) && (strncmp(dki_info.dki_dname, "vdc", 4) == 0)) { /* * The controller and drive name "vdc" (virtual disk client) * indicates a LDoms virtual disk. */ vdc_flag++; } /* get the LBA size */ if (read_disk_info(fd, &capacity, &lbsize) == -1) { if (efi_debug) { (void) fprintf(stderr, "unable to read disk info: %d", errno); } return (VT_EINVAL); } disk_info.dki_lbsize = lbsize; disk_info.dki_capacity = capacity; if (disk_info.dki_lbsize == 0) { if (efi_debug) { (void) fprintf(stderr, "efi_read: assuming LBA 512 bytes\n"); } disk_info.dki_lbsize = DEV_BSIZE; } /* * Read the EFI GPT to figure out how many partitions we need * to deal with. */ dk_ioc.dki_lba = 1; if (NBLOCKS(vtoc->efi_nparts, disk_info.dki_lbsize) < 34) { label_len = EFI_MIN_ARRAY_SIZE + disk_info.dki_lbsize; } else { label_len = vtoc->efi_nparts * (int) sizeof (efi_gpe_t) + disk_info.dki_lbsize; if (label_len % disk_info.dki_lbsize) { /* pad to physical sector size */ label_len += disk_info.dki_lbsize; label_len &= ~(disk_info.dki_lbsize - 1); } } if (posix_memalign((void **)&dk_ioc.dki_data, disk_info.dki_lbsize, label_len)) return (VT_ERROR); memset(dk_ioc.dki_data, 0, label_len); dk_ioc.dki_length = disk_info.dki_lbsize; user_length = vtoc->efi_nparts; efi = dk_ioc.dki_data; if (md_flag) { dk_ioc.dki_length = label_len; if (efi_ioctl(fd, DKIOCGETEFI, &dk_ioc) == -1) { switch (errno) { case EIO: return (VT_EIO); default: return (VT_ERROR); } } } else if ((rval = check_label(fd, &dk_ioc)) == VT_EINVAL) { /* * No valid label here; try the alternate. Note that here * we just read GPT header and save it into dk_ioc.data, * Later, we will read GUID partition entry array if we * can get valid GPT header. */ /* * This is a workaround for legacy systems. In the past, the * last sector of SCSI disk was invisible on x86 platform. At * that time, backup label was saved on the next to the last * sector. It is possible for users to move a disk from previous * solaris system to present system. Here, we attempt to search * legacy backup EFI label first. */ dk_ioc.dki_lba = disk_info.dki_capacity - 2; dk_ioc.dki_length = disk_info.dki_lbsize; rval = check_label(fd, &dk_ioc); if (rval == VT_EINVAL) { /* * we didn't find legacy backup EFI label, try to * search backup EFI label in the last block. */ dk_ioc.dki_lba = disk_info.dki_capacity - 1; dk_ioc.dki_length = disk_info.dki_lbsize; rval = check_label(fd, &dk_ioc); if (rval == 0) { legacy_label = B_TRUE; if (efi_debug) (void) fprintf(stderr, "efi_read: primary label corrupt; " "using EFI backup label located on" " the last block\n"); } } else { if ((efi_debug) && (rval == 0)) (void) fprintf(stderr, "efi_read: primary label" " corrupt; using legacy EFI backup label " " located on the next to last block\n"); } if (rval == 0) { dk_ioc.dki_lba = LE_64(efi->efi_gpt_PartitionEntryLBA); vtoc->efi_flags |= EFI_GPT_PRIMARY_CORRUPT; vtoc->efi_nparts = LE_32(efi->efi_gpt_NumberOfPartitionEntries); /* * Partition tables are between backup GPT header * table and ParitionEntryLBA (the starting LBA of * the GUID partition entries array). Now that we * already got valid GPT header and saved it in * dk_ioc.dki_data, we try to get GUID partition * entry array here. */ /* LINTED */ dk_ioc.dki_data = (efi_gpt_t *)((char *)dk_ioc.dki_data + disk_info.dki_lbsize); if (legacy_label) dk_ioc.dki_length = disk_info.dki_capacity - 1 - dk_ioc.dki_lba; else dk_ioc.dki_length = disk_info.dki_capacity - 2 - dk_ioc.dki_lba; dk_ioc.dki_length *= disk_info.dki_lbsize; if (dk_ioc.dki_length > ((len_t)label_len - sizeof (*dk_ioc.dki_data))) { rval = VT_EINVAL; } else { /* * read GUID partition entry array */ rval = efi_ioctl(fd, DKIOCGETEFI, &dk_ioc); } } } else if (rval == 0) { dk_ioc.dki_lba = LE_64(efi->efi_gpt_PartitionEntryLBA); /* LINTED */ dk_ioc.dki_data = (efi_gpt_t *)((char *)dk_ioc.dki_data + disk_info.dki_lbsize); dk_ioc.dki_length = label_len - disk_info.dki_lbsize; rval = efi_ioctl(fd, DKIOCGETEFI, &dk_ioc); } else if (vdc_flag && rval == VT_ERROR && errno == EINVAL) { /* * When the device is a LDoms virtual disk, the DKIOCGETEFI * ioctl can fail with EINVAL if the virtual disk backend * is a ZFS volume serviced by a domain running an old version * of Solaris. This is because the DKIOCGETEFI ioctl was * initially incorrectly implemented for a ZFS volume and it * expected the GPT and GPE to be retrieved with a single ioctl. * So we try to read the GPT and the GPE using that old style * ioctl. */ dk_ioc.dki_lba = 1; dk_ioc.dki_length = label_len; rval = check_label(fd, &dk_ioc); } if (rval < 0) { free(efi); return (rval); } /* LINTED -- always longlong aligned */ efi_parts = (efi_gpe_t *)(((char *)efi) + disk_info.dki_lbsize); /* * Assemble this into a "dk_gpt" struct for easier * digestibility by applications. */ vtoc->efi_version = LE_32(efi->efi_gpt_Revision); vtoc->efi_nparts = LE_32(efi->efi_gpt_NumberOfPartitionEntries); vtoc->efi_part_size = LE_32(efi->efi_gpt_SizeOfPartitionEntry); vtoc->efi_lbasize = disk_info.dki_lbsize; vtoc->efi_last_lba = disk_info.dki_capacity - 1; vtoc->efi_first_u_lba = LE_64(efi->efi_gpt_FirstUsableLBA); vtoc->efi_last_u_lba = LE_64(efi->efi_gpt_LastUsableLBA); vtoc->efi_altern_lba = LE_64(efi->efi_gpt_AlternateLBA); UUID_LE_CONVERT(vtoc->efi_disk_uguid, efi->efi_gpt_DiskGUID); /* * If the array the user passed in is too small, set the length * to what it needs to be and return */ if (user_length < vtoc->efi_nparts) { return (VT_EINVAL); } for (i = 0; i < vtoc->efi_nparts; i++) { UUID_LE_CONVERT(vtoc->efi_parts[i].p_guid, efi_parts[i].efi_gpe_PartitionTypeGUID); for (j = 0; j < sizeof (conversion_array) / sizeof (struct uuid_to_ptag); j++) { if (memcmp(&vtoc->efi_parts[i].p_guid, &conversion_array[j].uuid, sizeof (struct uuid)) == 0) { vtoc->efi_parts[i].p_tag = j; break; } } if (vtoc->efi_parts[i].p_tag == V_UNASSIGNED) continue; vtoc->efi_parts[i].p_flag = LE_16(efi_parts[i].efi_gpe_Attributes.PartitionAttrs); vtoc->efi_parts[i].p_start = LE_64(efi_parts[i].efi_gpe_StartingLBA); vtoc->efi_parts[i].p_size = LE_64(efi_parts[i].efi_gpe_EndingLBA) - vtoc->efi_parts[i].p_start + 1; for (j = 0; j < EFI_PART_NAME_LEN; j++) { vtoc->efi_parts[i].p_name[j] = (uchar_t)LE_16( efi_parts[i].efi_gpe_PartitionName[j]); } UUID_LE_CONVERT(vtoc->efi_parts[i].p_uguid, efi_parts[i].efi_gpe_UniquePartitionGUID); } free(efi); return (dki_info.dki_partition); } /* writes a "protective" MBR */ static int write_pmbr(int fd, struct dk_gpt *vtoc) { dk_efi_t dk_ioc; struct mboot mb; uchar_t *cp; diskaddr_t size_in_lba; uchar_t *buf; int len; len = (vtoc->efi_lbasize == 0) ? sizeof (mb) : vtoc->efi_lbasize; if (posix_memalign((void **)&buf, len, len)) return (VT_ERROR); /* * Preserve any boot code and disk signature if the first block is * already an MBR. */ memset(buf, 0, len); dk_ioc.dki_lba = 0; dk_ioc.dki_length = len; /* LINTED -- always longlong aligned */ dk_ioc.dki_data = (efi_gpt_t *)buf; if (efi_ioctl(fd, DKIOCGETEFI, &dk_ioc) == -1) { memset(&mb, 0, sizeof (mb)); mb.signature = LE_16(MBB_MAGIC); } else { (void) memcpy(&mb, buf, sizeof (mb)); if (mb.signature != LE_16(MBB_MAGIC)) { memset(&mb, 0, sizeof (mb)); mb.signature = LE_16(MBB_MAGIC); } } memset(&mb.parts, 0, sizeof (mb.parts)); cp = (uchar_t *)&mb.parts[0]; /* bootable or not */ *cp++ = 0; /* beginning CHS; 0xffffff if not representable */ *cp++ = 0xff; *cp++ = 0xff; *cp++ = 0xff; /* OS type */ *cp++ = EFI_PMBR; /* ending CHS; 0xffffff if not representable */ *cp++ = 0xff; *cp++ = 0xff; *cp++ = 0xff; /* starting LBA: 1 (little endian format) by EFI definition */ *cp++ = 0x01; *cp++ = 0x00; *cp++ = 0x00; *cp++ = 0x00; /* ending LBA: last block on the disk (little endian format) */ size_in_lba = vtoc->efi_last_lba; if (size_in_lba < 0xffffffff) { *cp++ = (size_in_lba & 0x000000ff); *cp++ = (size_in_lba & 0x0000ff00) >> 8; *cp++ = (size_in_lba & 0x00ff0000) >> 16; *cp++ = (size_in_lba & 0xff000000) >> 24; } else { *cp++ = 0xff; *cp++ = 0xff; *cp++ = 0xff; *cp++ = 0xff; } (void) memcpy(buf, &mb, sizeof (mb)); /* LINTED -- always longlong aligned */ dk_ioc.dki_data = (efi_gpt_t *)buf; dk_ioc.dki_lba = 0; dk_ioc.dki_length = len; if (efi_ioctl(fd, DKIOCSETEFI, &dk_ioc) == -1) { free(buf); switch (errno) { case EIO: return (VT_EIO); case EINVAL: return (VT_EINVAL); default: return (VT_ERROR); } } free(buf); return (0); } /* make sure the user specified something reasonable */ static int check_input(struct dk_gpt *vtoc) { int resv_part = -1; int i, j; diskaddr_t istart, jstart, isize, jsize, endsect; /* * Sanity-check the input (make sure no partitions overlap) */ for (i = 0; i < vtoc->efi_nparts; i++) { /* It can't be unassigned and have an actual size */ if ((vtoc->efi_parts[i].p_tag == V_UNASSIGNED) && (vtoc->efi_parts[i].p_size != 0)) { if (efi_debug) { (void) fprintf(stderr, "partition %d is " "\"unassigned\" but has a size of %llu", i, vtoc->efi_parts[i].p_size); } return (VT_EINVAL); } if (vtoc->efi_parts[i].p_tag == V_UNASSIGNED) { if (uuid_is_null((uchar_t *)&vtoc->efi_parts[i].p_guid)) continue; /* we have encountered an unknown uuid */ vtoc->efi_parts[i].p_tag = 0xff; } if (vtoc->efi_parts[i].p_tag == V_RESERVED) { if (resv_part != -1) { if (efi_debug) { (void) fprintf(stderr, "found " "duplicate reserved partition " "at %d\n", i); } return (VT_EINVAL); } resv_part = i; } if ((vtoc->efi_parts[i].p_start < vtoc->efi_first_u_lba) || (vtoc->efi_parts[i].p_start > vtoc->efi_last_u_lba)) { if (efi_debug) { (void) fprintf(stderr, "Partition %d starts at %llu. ", i, vtoc->efi_parts[i].p_start); (void) fprintf(stderr, "It must be between %llu and %llu.\n", vtoc->efi_first_u_lba, vtoc->efi_last_u_lba); } return (VT_EINVAL); } if ((vtoc->efi_parts[i].p_start + vtoc->efi_parts[i].p_size < vtoc->efi_first_u_lba) || (vtoc->efi_parts[i].p_start + vtoc->efi_parts[i].p_size > vtoc->efi_last_u_lba + 1)) { if (efi_debug) { (void) fprintf(stderr, "Partition %d ends at %llu. ", i, vtoc->efi_parts[i].p_start + vtoc->efi_parts[i].p_size); (void) fprintf(stderr, "It must be between %llu and %llu.\n", vtoc->efi_first_u_lba, vtoc->efi_last_u_lba); } return (VT_EINVAL); } for (j = 0; j < vtoc->efi_nparts; j++) { isize = vtoc->efi_parts[i].p_size; jsize = vtoc->efi_parts[j].p_size; istart = vtoc->efi_parts[i].p_start; jstart = vtoc->efi_parts[j].p_start; if ((i != j) && (isize != 0) && (jsize != 0)) { endsect = jstart + jsize -1; if ((jstart <= istart) && (istart <= endsect)) { if (efi_debug) { (void) fprintf(stderr, "Partition %d overlaps " "partition %d.", i, j); } return (VT_EINVAL); } } } } /* just a warning for now */ if ((resv_part == -1) && efi_debug) { (void) fprintf(stderr, "no reserved partition found\n"); } return (0); } static int call_blkpg_ioctl(int fd, int command, diskaddr_t start, diskaddr_t size, uint_t pno) { struct blkpg_ioctl_arg ioctl_arg; struct blkpg_partition linux_part; memset(&linux_part, 0, sizeof (linux_part)); char *path = efi_get_devname(fd); if (path == NULL) { (void) fprintf(stderr, "failed to retrieve device name\n"); return (VT_EINVAL); } linux_part.start = start; linux_part.length = size; linux_part.pno = pno; snprintf(linux_part.devname, BLKPG_DEVNAMELTH - 1, "%s%u", path, pno); linux_part.devname[BLKPG_DEVNAMELTH - 1] = '\0'; free(path); ioctl_arg.op = command; ioctl_arg.flags = 0; ioctl_arg.datalen = sizeof (struct blkpg_partition); ioctl_arg.data = &linux_part; return (ioctl(fd, BLKPG, &ioctl_arg)); } /* * add all the unallocated space to the current label */ int efi_use_whole_disk(int fd) { struct dk_gpt *efi_label = NULL; int rval; int i; uint_t resv_index = 0, data_index = 0; diskaddr_t resv_start = 0, data_start = 0; diskaddr_t data_size, limit, difference; boolean_t sync_needed = B_FALSE; uint_t nblocks; rval = efi_alloc_and_read(fd, &efi_label); if (rval < 0) { if (efi_label != NULL) efi_free(efi_label); return (rval); } /* * Find the last physically non-zero partition. * This should be the reserved partition. */ for (i = 0; i < efi_label->efi_nparts; i ++) { if (resv_start < efi_label->efi_parts[i].p_start) { resv_start = efi_label->efi_parts[i].p_start; resv_index = i; } } /* * Find the last physically non-zero partition before that. * This is the data partition. */ for (i = 0; i < resv_index; i ++) { if (data_start < efi_label->efi_parts[i].p_start) { data_start = efi_label->efi_parts[i].p_start; data_index = i; } } data_size = efi_label->efi_parts[data_index].p_size; /* * See the "efi_alloc_and_init" function for more information * about where this "nblocks" value comes from. */ nblocks = efi_label->efi_first_u_lba - 1; /* * Determine if the EFI label is out of sync. We check that: * * 1. the data partition ends at the limit we set, and * 2. the reserved partition starts at the limit we set. * * If either of these conditions is not met, then we need to * resync the EFI label. * * The limit is the last usable LBA, determined by the last LBA * and the first usable LBA fields on the EFI label of the disk * (see the lines directly above). Additionally, we factor in * EFI_MIN_RESV_SIZE (per its use in "zpool_label_disk") and * P2ALIGN it to ensure the partition boundaries are aligned * (for performance reasons). The alignment should match the * alignment used by the "zpool_label_disk" function. */ limit = P2ALIGN(efi_label->efi_last_lba - nblocks - EFI_MIN_RESV_SIZE, PARTITION_END_ALIGNMENT); if (data_start + data_size != limit || resv_start != limit) sync_needed = B_TRUE; if (efi_debug && sync_needed) (void) fprintf(stderr, "efi_use_whole_disk: sync needed\n"); /* * If alter_lba is 1, we are using the backup label. * Since we can locate the backup label by disk capacity, * there must be no unallocated space. */ if ((efi_label->efi_altern_lba == 1) || (efi_label->efi_altern_lba >= efi_label->efi_last_lba && !sync_needed)) { if (efi_debug) { (void) fprintf(stderr, "efi_use_whole_disk: requested space not found\n"); } efi_free(efi_label); return (VT_ENOSPC); } /* * Verify that we've found the reserved partition by checking * that it looks the way it did when we created it in zpool_label_disk. * If we've found the incorrect partition, then we know that this * device was reformatted and no longer is solely used by ZFS. */ if ((efi_label->efi_parts[resv_index].p_size != EFI_MIN_RESV_SIZE) || (efi_label->efi_parts[resv_index].p_tag != V_RESERVED) || (resv_index != 8)) { if (efi_debug) { (void) fprintf(stderr, "efi_use_whole_disk: wholedisk not available\n"); } efi_free(efi_label); return (VT_ENOSPC); } if (data_start + data_size != resv_start) { if (efi_debug) { (void) fprintf(stderr, "efi_use_whole_disk: " "data_start (%lli) + " "data_size (%lli) != " "resv_start (%lli)\n", data_start, data_size, resv_start); } return (VT_EINVAL); } if (limit < resv_start) { if (efi_debug) { (void) fprintf(stderr, "efi_use_whole_disk: " "limit (%lli) < resv_start (%lli)\n", limit, resv_start); } return (VT_EINVAL); } difference = limit - resv_start; if (efi_debug) (void) fprintf(stderr, "efi_use_whole_disk: difference is %lli\n", difference); /* * Move the reserved partition. There is currently no data in * here except fabricated devids (which get generated via * efi_write()). So there is no need to copy data. */ efi_label->efi_parts[data_index].p_size += difference; efi_label->efi_parts[resv_index].p_start += difference; efi_label->efi_last_u_lba = efi_label->efi_last_lba - nblocks; /* * Rescanning the partition table in the kernel can result * in the device links to be removed (see comment in vdev_disk_open). * If BLKPG_RESIZE_PARTITION is available, then we can resize * the partition table online and avoid having to remove the device * links used by the pool. This provides a very deterministic * approach to resizing devices and does not require any * loops waiting for devices to reappear. */ #ifdef BLKPG_RESIZE_PARTITION /* * Delete the reserved partition since we're about to expand * the data partition and it would overlap with the reserved * partition. * NOTE: The starting index for the ioctl is 1 while for the * EFI partitions it's 0. For that reason we have to add one * whenever we make an ioctl call. */ rval = call_blkpg_ioctl(fd, BLKPG_DEL_PARTITION, 0, 0, resv_index + 1); if (rval != 0) goto out; /* * Expand the data partition */ rval = call_blkpg_ioctl(fd, BLKPG_RESIZE_PARTITION, efi_label->efi_parts[data_index].p_start * efi_label->efi_lbasize, efi_label->efi_parts[data_index].p_size * efi_label->efi_lbasize, data_index + 1); if (rval != 0) { (void) fprintf(stderr, "Unable to resize data " "partition: %d\n", rval); /* * Since we failed to resize, we need to reset the start * of the reserve partition and re-create it. */ efi_label->efi_parts[resv_index].p_start -= difference; } /* * Re-add the reserved partition. If we've expanded the data partition * then we'll move the reserve partition to the end of the data * partition. Otherwise, we'll recreate the partition in its original * location. Note that we do this as best-effort and ignore any * errors that may arise here. This will ensure that we finish writing * the EFI label. */ (void) call_blkpg_ioctl(fd, BLKPG_ADD_PARTITION, efi_label->efi_parts[resv_index].p_start * efi_label->efi_lbasize, efi_label->efi_parts[resv_index].p_size * efi_label->efi_lbasize, resv_index + 1); #endif /* * We're now ready to write the EFI label. */ if (rval == 0) { rval = efi_write(fd, efi_label); if (rval < 0 && efi_debug) { (void) fprintf(stderr, "efi_use_whole_disk:fail " "to write label, rval=%d\n", rval); } } out: efi_free(efi_label); return (rval); } /* * write EFI label and backup label */ int efi_write(int fd, struct dk_gpt *vtoc) { dk_efi_t dk_ioc; efi_gpt_t *efi; efi_gpe_t *efi_parts; int i, j; struct dk_cinfo dki_info; int rval; int md_flag = 0; int nblocks; diskaddr_t lba_backup_gpt_hdr; if ((rval = efi_get_info(fd, &dki_info)) != 0) return (rval); /* check if we are dealing with a metadevice */ if ((strncmp(dki_info.dki_cname, "pseudo", 7) == 0) && (strncmp(dki_info.dki_dname, "md", 3) == 0)) { md_flag = 1; } if (check_input(vtoc)) { /* * not valid; if it's a metadevice just pass it down * because SVM will do its own checking */ if (md_flag == 0) { return (VT_EINVAL); } } dk_ioc.dki_lba = 1; if (NBLOCKS(vtoc->efi_nparts, vtoc->efi_lbasize) < 34) { dk_ioc.dki_length = EFI_MIN_ARRAY_SIZE + vtoc->efi_lbasize; } else { dk_ioc.dki_length = NBLOCKS(vtoc->efi_nparts, vtoc->efi_lbasize) * vtoc->efi_lbasize; } /* * the number of blocks occupied by GUID partition entry array */ nblocks = dk_ioc.dki_length / vtoc->efi_lbasize - 1; /* * Backup GPT header is located on the block after GUID * partition entry array. Here, we calculate the address * for backup GPT header. */ lba_backup_gpt_hdr = vtoc->efi_last_u_lba + 1 + nblocks; if (posix_memalign((void **)&dk_ioc.dki_data, vtoc->efi_lbasize, dk_ioc.dki_length)) return (VT_ERROR); memset(dk_ioc.dki_data, 0, dk_ioc.dki_length); efi = dk_ioc.dki_data; /* stuff user's input into EFI struct */ efi->efi_gpt_Signature = LE_64(EFI_SIGNATURE); efi->efi_gpt_Revision = LE_32(vtoc->efi_version); /* 0x02000100 */ efi->efi_gpt_HeaderSize = LE_32(sizeof (struct efi_gpt) - LEN_EFI_PAD); efi->efi_gpt_Reserved1 = 0; efi->efi_gpt_MyLBA = LE_64(1ULL); efi->efi_gpt_AlternateLBA = LE_64(lba_backup_gpt_hdr); efi->efi_gpt_FirstUsableLBA = LE_64(vtoc->efi_first_u_lba); efi->efi_gpt_LastUsableLBA = LE_64(vtoc->efi_last_u_lba); efi->efi_gpt_PartitionEntryLBA = LE_64(2ULL); efi->efi_gpt_NumberOfPartitionEntries = LE_32(vtoc->efi_nparts); efi->efi_gpt_SizeOfPartitionEntry = LE_32(sizeof (struct efi_gpe)); UUID_LE_CONVERT(efi->efi_gpt_DiskGUID, vtoc->efi_disk_uguid); /* LINTED -- always longlong aligned */ efi_parts = (efi_gpe_t *)((char *)dk_ioc.dki_data + vtoc->efi_lbasize); for (i = 0; i < vtoc->efi_nparts; i++) { for (j = 0; j < sizeof (conversion_array) / sizeof (struct uuid_to_ptag); j++) { if (vtoc->efi_parts[i].p_tag == j) { UUID_LE_CONVERT( efi_parts[i].efi_gpe_PartitionTypeGUID, conversion_array[j].uuid); break; } } if (j == sizeof (conversion_array) / sizeof (struct uuid_to_ptag)) { /* * If we didn't have a matching uuid match, bail here. * Don't write a label with unknown uuid. */ if (efi_debug) { (void) fprintf(stderr, "Unknown uuid for p_tag %d\n", vtoc->efi_parts[i].p_tag); } return (VT_EINVAL); } /* Zero's should be written for empty partitions */ if (vtoc->efi_parts[i].p_tag == V_UNASSIGNED) continue; efi_parts[i].efi_gpe_StartingLBA = LE_64(vtoc->efi_parts[i].p_start); efi_parts[i].efi_gpe_EndingLBA = LE_64(vtoc->efi_parts[i].p_start + vtoc->efi_parts[i].p_size - 1); efi_parts[i].efi_gpe_Attributes.PartitionAttrs = LE_16(vtoc->efi_parts[i].p_flag); for (j = 0; j < EFI_PART_NAME_LEN; j++) { efi_parts[i].efi_gpe_PartitionName[j] = LE_16((ushort_t)vtoc->efi_parts[i].p_name[j]); } if ((vtoc->efi_parts[i].p_tag != V_UNASSIGNED) && uuid_is_null((uchar_t *)&vtoc->efi_parts[i].p_uguid)) { (void) uuid_generate((uchar_t *) &vtoc->efi_parts[i].p_uguid); } memcpy(&efi_parts[i].efi_gpe_UniquePartitionGUID, &vtoc->efi_parts[i].p_uguid, sizeof (uuid_t)); } efi->efi_gpt_PartitionEntryArrayCRC32 = LE_32(efi_crc32((unsigned char *)efi_parts, vtoc->efi_nparts * (int)sizeof (struct efi_gpe))); efi->efi_gpt_HeaderCRC32 = LE_32(efi_crc32((unsigned char *)efi, LE_32(efi->efi_gpt_HeaderSize))); if (efi_ioctl(fd, DKIOCSETEFI, &dk_ioc) == -1) { free(dk_ioc.dki_data); switch (errno) { case EIO: return (VT_EIO); case EINVAL: return (VT_EINVAL); default: return (VT_ERROR); } } /* if it's a metadevice we're done */ if (md_flag) { free(dk_ioc.dki_data); return (0); } /* write backup partition array */ dk_ioc.dki_lba = vtoc->efi_last_u_lba + 1; dk_ioc.dki_length -= vtoc->efi_lbasize; /* LINTED */ dk_ioc.dki_data = (efi_gpt_t *)((char *)dk_ioc.dki_data + vtoc->efi_lbasize); if (efi_ioctl(fd, DKIOCSETEFI, &dk_ioc) == -1) { /* * we wrote the primary label okay, so don't fail */ if (efi_debug) { (void) fprintf(stderr, "write of backup partitions to block %llu " "failed, errno %d\n", vtoc->efi_last_u_lba + 1, errno); } } /* * now swap MyLBA and AlternateLBA fields and write backup * partition table header */ dk_ioc.dki_lba = lba_backup_gpt_hdr; dk_ioc.dki_length = vtoc->efi_lbasize; /* LINTED */ dk_ioc.dki_data = (efi_gpt_t *)((char *)dk_ioc.dki_data - vtoc->efi_lbasize); efi->efi_gpt_AlternateLBA = LE_64(1ULL); efi->efi_gpt_MyLBA = LE_64(lba_backup_gpt_hdr); efi->efi_gpt_PartitionEntryLBA = LE_64(vtoc->efi_last_u_lba + 1); efi->efi_gpt_HeaderCRC32 = 0; efi->efi_gpt_HeaderCRC32 = LE_32(efi_crc32((unsigned char *)dk_ioc.dki_data, LE_32(efi->efi_gpt_HeaderSize))); if (efi_ioctl(fd, DKIOCSETEFI, &dk_ioc) == -1) { if (efi_debug) { (void) fprintf(stderr, "write of backup header to block %llu failed, " "errno %d\n", lba_backup_gpt_hdr, errno); } } /* write the PMBR */ (void) write_pmbr(fd, vtoc); free(dk_ioc.dki_data); return (0); } void efi_free(struct dk_gpt *ptr) { free(ptr); } void efi_err_check(struct dk_gpt *vtoc) { int resv_part = -1; int i, j; diskaddr_t istart, jstart, isize, jsize, endsect; int overlap = 0; /* * make sure no partitions overlap */ for (i = 0; i < vtoc->efi_nparts; i++) { /* It can't be unassigned and have an actual size */ if ((vtoc->efi_parts[i].p_tag == V_UNASSIGNED) && (vtoc->efi_parts[i].p_size != 0)) { (void) fprintf(stderr, "partition %d is \"unassigned\" but has a size " "of %llu\n", i, vtoc->efi_parts[i].p_size); } if (vtoc->efi_parts[i].p_tag == V_UNASSIGNED) { continue; } if (vtoc->efi_parts[i].p_tag == V_RESERVED) { if (resv_part != -1) { (void) fprintf(stderr, "found duplicate reserved partition at " "%d\n", i); } resv_part = i; if (vtoc->efi_parts[i].p_size != EFI_MIN_RESV_SIZE) (void) fprintf(stderr, "Warning: reserved partition size must " "be %d sectors\n", EFI_MIN_RESV_SIZE); } if ((vtoc->efi_parts[i].p_start < vtoc->efi_first_u_lba) || (vtoc->efi_parts[i].p_start > vtoc->efi_last_u_lba)) { (void) fprintf(stderr, "Partition %d starts at %llu\n", i, vtoc->efi_parts[i].p_start); (void) fprintf(stderr, "It must be between %llu and %llu.\n", vtoc->efi_first_u_lba, vtoc->efi_last_u_lba); } if ((vtoc->efi_parts[i].p_start + vtoc->efi_parts[i].p_size < vtoc->efi_first_u_lba) || (vtoc->efi_parts[i].p_start + vtoc->efi_parts[i].p_size > vtoc->efi_last_u_lba + 1)) { (void) fprintf(stderr, "Partition %d ends at %llu\n", i, vtoc->efi_parts[i].p_start + vtoc->efi_parts[i].p_size); (void) fprintf(stderr, "It must be between %llu and %llu.\n", vtoc->efi_first_u_lba, vtoc->efi_last_u_lba); } for (j = 0; j < vtoc->efi_nparts; j++) { isize = vtoc->efi_parts[i].p_size; jsize = vtoc->efi_parts[j].p_size; istart = vtoc->efi_parts[i].p_start; jstart = vtoc->efi_parts[j].p_start; if ((i != j) && (isize != 0) && (jsize != 0)) { endsect = jstart + jsize -1; if ((jstart <= istart) && (istart <= endsect)) { if (!overlap) { (void) fprintf(stderr, "label error: EFI Labels do not " "support overlapping partitions\n"); } (void) fprintf(stderr, "Partition %d overlaps partition " "%d.\n", i, j); overlap = 1; } } } } /* make sure there is a reserved partition */ if (resv_part == -1) { (void) fprintf(stderr, "no reserved partition found\n"); } } diff --git a/lib/libzfs/libzfs_dataset.c b/lib/libzfs/libzfs_dataset.c index 133b3b358831..f8a61c64261f 100644 --- a/lib/libzfs/libzfs_dataset.c +++ b/lib/libzfs/libzfs_dataset.c @@ -1,5592 +1,5592 @@ /* * CDDL HEADER START * * The contents of this file are subject to the terms of the * Common Development and Distribution License (the "License"). * You may not use this file except in compliance with the License. * * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE * or https://opensource.org/licenses/CDDL-1.0. * See the License for the specific language governing permissions * and limitations under the License. * * When distributing Covered Code, include this CDDL HEADER in each * file and include the License file at usr/src/OPENSOLARIS.LICENSE. * If applicable, add the following below this CDDL HEADER, with the * fields enclosed by brackets "[]" replaced with your own identifying * information: Portions Copyright [yyyy] [name of copyright owner] * * CDDL HEADER END */ /* * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. * Copyright 2019 Joyent, Inc. * Copyright (c) 2011, 2020 by Delphix. All rights reserved. * Copyright (c) 2012 DEY Storage Systems, Inc. All rights reserved. * Copyright (c) 2012 Pawel Jakub Dawidek . * Copyright (c) 2013 Martin Matuska. All rights reserved. * Copyright (c) 2013 Steven Hartland. All rights reserved. * Copyright 2017 Nexenta Systems, Inc. * Copyright 2016 Igor Kozhukhov * Copyright 2017-2018 RackTop Systems. * Copyright (c) 2019 Datto Inc. * Copyright (c) 2019, loli10K * Copyright (c) 2021 Matt Fiddaman */ #include #include #include #include #include #include #include #include #include #include #include #include #include #include #ifdef HAVE_IDMAP #include #include #include #endif /* HAVE_IDMAP */ #include #include #include #include #include #include #include "zfs_namecheck.h" #include "zfs_prop.h" #include "libzfs_impl.h" #include "zfs_deleg.h" static int userquota_propname_decode(const char *propname, boolean_t zoned, zfs_userquota_prop_t *typep, char *domain, int domainlen, uint64_t *ridp); /* * Given a single type (not a mask of types), return the type in a human * readable form. */ const char * zfs_type_to_name(zfs_type_t type) { switch (type) { case ZFS_TYPE_FILESYSTEM: return (dgettext(TEXT_DOMAIN, "filesystem")); case ZFS_TYPE_SNAPSHOT: return (dgettext(TEXT_DOMAIN, "snapshot")); case ZFS_TYPE_VOLUME: return (dgettext(TEXT_DOMAIN, "volume")); case ZFS_TYPE_POOL: return (dgettext(TEXT_DOMAIN, "pool")); case ZFS_TYPE_BOOKMARK: return (dgettext(TEXT_DOMAIN, "bookmark")); default: assert(!"unhandled zfs_type_t"); } return (NULL); } /* * Validate a ZFS path. This is used even before trying to open the dataset, to * provide a more meaningful error message. We call zfs_error_aux() to * explain exactly why the name was not valid. */ int zfs_validate_name(libzfs_handle_t *hdl, const char *path, int type, boolean_t modifying) { namecheck_err_t why; char what; if (!(type & ZFS_TYPE_SNAPSHOT) && strchr(path, '@') != NULL) { if (hdl != NULL) zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "snapshot delimiter '@' is not expected here")); return (0); } if (type == ZFS_TYPE_SNAPSHOT && strchr(path, '@') == NULL) { if (hdl != NULL) zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "missing '@' delimiter in snapshot name")); return (0); } if (!(type & ZFS_TYPE_BOOKMARK) && strchr(path, '#') != NULL) { if (hdl != NULL) zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "bookmark delimiter '#' is not expected here")); return (0); } if (type == ZFS_TYPE_BOOKMARK && strchr(path, '#') == NULL) { if (hdl != NULL) zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "missing '#' delimiter in bookmark name")); return (0); } if (modifying && strchr(path, '%') != NULL) { if (hdl != NULL) zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "invalid character %c in name"), '%'); return (0); } if (entity_namecheck(path, &why, &what) != 0) { if (hdl != NULL) { switch (why) { case NAME_ERR_TOOLONG: zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "name is too long")); break; case NAME_ERR_LEADING_SLASH: zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "leading slash in name")); break; case NAME_ERR_EMPTY_COMPONENT: zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "empty component or misplaced '@'" " or '#' delimiter in name")); break; case NAME_ERR_TRAILING_SLASH: zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "trailing slash in name")); break; case NAME_ERR_INVALCHAR: zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "invalid character " "'%c' in name"), what); break; case NAME_ERR_MULTIPLE_DELIMITERS: zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "multiple '@' and/or '#' delimiters in " "name")); break; case NAME_ERR_NOLETTER: zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "pool doesn't begin with a letter")); break; case NAME_ERR_RESERVED: zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "name is reserved")); break; case NAME_ERR_DISKLIKE: zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "reserved disk name")); break; case NAME_ERR_SELF_REF: zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "self reference, '.' is found in name")); break; case NAME_ERR_PARENT_REF: zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "parent reference, '..' is found in name")); break; default: zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "(%d) not defined"), why); break; } } return (0); } return (-1); } int zfs_name_valid(const char *name, zfs_type_t type) { if (type == ZFS_TYPE_POOL) return (zpool_name_valid(NULL, B_FALSE, name)); return (zfs_validate_name(NULL, name, type, B_FALSE)); } /* * This function takes the raw DSL properties, and filters out the user-defined * properties into a separate nvlist. */ static nvlist_t * process_user_props(zfs_handle_t *zhp, nvlist_t *props) { libzfs_handle_t *hdl = zhp->zfs_hdl; nvpair_t *elem; nvlist_t *nvl; if (nvlist_alloc(&nvl, NV_UNIQUE_NAME, 0) != 0) { (void) no_memory(hdl); return (NULL); } elem = NULL; while ((elem = nvlist_next_nvpair(props, elem)) != NULL) { if (!zfs_prop_user(nvpair_name(elem))) continue; nvlist_t *propval = fnvpair_value_nvlist(elem); if (nvlist_add_nvlist(nvl, nvpair_name(elem), propval) != 0) { nvlist_free(nvl); (void) no_memory(hdl); return (NULL); } } return (nvl); } static zpool_handle_t * zpool_add_handle(zfs_handle_t *zhp, const char *pool_name) { libzfs_handle_t *hdl = zhp->zfs_hdl; zpool_handle_t *zph; if ((zph = zpool_open_canfail(hdl, pool_name)) != NULL) { if (hdl->libzfs_pool_handles != NULL) zph->zpool_next = hdl->libzfs_pool_handles; hdl->libzfs_pool_handles = zph; } return (zph); } static zpool_handle_t * zpool_find_handle(zfs_handle_t *zhp, const char *pool_name, int len) { libzfs_handle_t *hdl = zhp->zfs_hdl; zpool_handle_t *zph = hdl->libzfs_pool_handles; while ((zph != NULL) && (strncmp(pool_name, zpool_get_name(zph), len) != 0)) zph = zph->zpool_next; return (zph); } /* * Returns a handle to the pool that contains the provided dataset. * If a handle to that pool already exists then that handle is returned. * Otherwise, a new handle is created and added to the list of handles. */ static zpool_handle_t * zpool_handle(zfs_handle_t *zhp) { char *pool_name; int len; zpool_handle_t *zph; len = strcspn(zhp->zfs_name, "/@#") + 1; pool_name = zfs_alloc(zhp->zfs_hdl, len); (void) strlcpy(pool_name, zhp->zfs_name, len); zph = zpool_find_handle(zhp, pool_name, len); if (zph == NULL) zph = zpool_add_handle(zhp, pool_name); free(pool_name); return (zph); } void zpool_free_handles(libzfs_handle_t *hdl) { zpool_handle_t *next, *zph = hdl->libzfs_pool_handles; while (zph != NULL) { next = zph->zpool_next; zpool_close(zph); zph = next; } hdl->libzfs_pool_handles = NULL; } /* * Utility function to gather stats (objset and zpl) for the given object. */ static int get_stats_ioctl(zfs_handle_t *zhp, zfs_cmd_t *zc) { libzfs_handle_t *hdl = zhp->zfs_hdl; (void) strlcpy(zc->zc_name, zhp->zfs_name, sizeof (zc->zc_name)); while (zfs_ioctl(hdl, ZFS_IOC_OBJSET_STATS, zc) != 0) { if (errno == ENOMEM) zcmd_expand_dst_nvlist(hdl, zc); else return (-1); } return (0); } /* * Utility function to get the received properties of the given object. */ static int get_recvd_props_ioctl(zfs_handle_t *zhp) { libzfs_handle_t *hdl = zhp->zfs_hdl; nvlist_t *recvdprops; zfs_cmd_t zc = {"\0"}; int err; zcmd_alloc_dst_nvlist(hdl, &zc, 0); (void) strlcpy(zc.zc_name, zhp->zfs_name, sizeof (zc.zc_name)); while (zfs_ioctl(hdl, ZFS_IOC_OBJSET_RECVD_PROPS, &zc) != 0) { if (errno == ENOMEM) zcmd_expand_dst_nvlist(hdl, &zc); else { zcmd_free_nvlists(&zc); return (-1); } } err = zcmd_read_dst_nvlist(zhp->zfs_hdl, &zc, &recvdprops); zcmd_free_nvlists(&zc); if (err != 0) return (-1); nvlist_free(zhp->zfs_recvd_props); zhp->zfs_recvd_props = recvdprops; return (0); } static int put_stats_zhdl(zfs_handle_t *zhp, zfs_cmd_t *zc) { nvlist_t *allprops, *userprops; zhp->zfs_dmustats = zc->zc_objset_stats; /* structure assignment */ if (zcmd_read_dst_nvlist(zhp->zfs_hdl, zc, &allprops) != 0) { return (-1); } /* * XXX Why do we store the user props separately, in addition to * storing them in zfs_props? */ if ((userprops = process_user_props(zhp, allprops)) == NULL) { nvlist_free(allprops); return (-1); } nvlist_free(zhp->zfs_props); nvlist_free(zhp->zfs_user_props); zhp->zfs_props = allprops; zhp->zfs_user_props = userprops; return (0); } static int get_stats(zfs_handle_t *zhp) { int rc = 0; zfs_cmd_t zc = {"\0"}; zcmd_alloc_dst_nvlist(zhp->zfs_hdl, &zc, 0); if (get_stats_ioctl(zhp, &zc) != 0) rc = -1; else if (put_stats_zhdl(zhp, &zc) != 0) rc = -1; zcmd_free_nvlists(&zc); return (rc); } /* * Refresh the properties currently stored in the handle. */ void zfs_refresh_properties(zfs_handle_t *zhp) { (void) get_stats(zhp); } /* * Makes a handle from the given dataset name. Used by zfs_open() and * zfs_iter_* to create child handles on the fly. */ static int make_dataset_handle_common(zfs_handle_t *zhp, zfs_cmd_t *zc) { if (put_stats_zhdl(zhp, zc) != 0) return (-1); /* * We've managed to open the dataset and gather statistics. Determine * the high-level type. */ if (zhp->zfs_dmustats.dds_type == DMU_OST_ZVOL) { zhp->zfs_head_type = ZFS_TYPE_VOLUME; } else if (zhp->zfs_dmustats.dds_type == DMU_OST_ZFS) { zhp->zfs_head_type = ZFS_TYPE_FILESYSTEM; } else if (zhp->zfs_dmustats.dds_type == DMU_OST_OTHER) { errno = EINVAL; return (-1); } else if (zhp->zfs_dmustats.dds_inconsistent) { errno = EBUSY; return (-1); } else { abort(); } if (zhp->zfs_dmustats.dds_is_snapshot) zhp->zfs_type = ZFS_TYPE_SNAPSHOT; else if (zhp->zfs_dmustats.dds_type == DMU_OST_ZVOL) zhp->zfs_type = ZFS_TYPE_VOLUME; else if (zhp->zfs_dmustats.dds_type == DMU_OST_ZFS) zhp->zfs_type = ZFS_TYPE_FILESYSTEM; else abort(); /* we should never see any other types */ if ((zhp->zpool_hdl = zpool_handle(zhp)) == NULL) return (-1); return (0); } zfs_handle_t * make_dataset_handle(libzfs_handle_t *hdl, const char *path) { zfs_cmd_t zc = {"\0"}; zfs_handle_t *zhp = calloc(1, sizeof (zfs_handle_t)); if (zhp == NULL) return (NULL); zhp->zfs_hdl = hdl; (void) strlcpy(zhp->zfs_name, path, sizeof (zhp->zfs_name)); zcmd_alloc_dst_nvlist(hdl, &zc, 0); if (get_stats_ioctl(zhp, &zc) == -1) { zcmd_free_nvlists(&zc); free(zhp); return (NULL); } if (make_dataset_handle_common(zhp, &zc) == -1) { free(zhp); zhp = NULL; } zcmd_free_nvlists(&zc); return (zhp); } zfs_handle_t * make_dataset_handle_zc(libzfs_handle_t *hdl, zfs_cmd_t *zc) { zfs_handle_t *zhp = calloc(1, sizeof (zfs_handle_t)); if (zhp == NULL) return (NULL); zhp->zfs_hdl = hdl; (void) strlcpy(zhp->zfs_name, zc->zc_name, sizeof (zhp->zfs_name)); if (make_dataset_handle_common(zhp, zc) == -1) { free(zhp); return (NULL); } return (zhp); } zfs_handle_t * make_dataset_simple_handle_zc(zfs_handle_t *pzhp, zfs_cmd_t *zc) { zfs_handle_t *zhp = calloc(1, sizeof (zfs_handle_t)); if (zhp == NULL) return (NULL); zhp->zfs_hdl = pzhp->zfs_hdl; (void) strlcpy(zhp->zfs_name, zc->zc_name, sizeof (zhp->zfs_name)); zhp->zfs_head_type = pzhp->zfs_type; zhp->zfs_type = ZFS_TYPE_SNAPSHOT; zhp->zpool_hdl = zpool_handle(zhp); zhp->zfs_dmustats = zc->zc_objset_stats; return (zhp); } zfs_handle_t * zfs_handle_dup(zfs_handle_t *zhp_orig) { zfs_handle_t *zhp = calloc(1, sizeof (zfs_handle_t)); if (zhp == NULL) return (NULL); zhp->zfs_hdl = zhp_orig->zfs_hdl; zhp->zpool_hdl = zhp_orig->zpool_hdl; (void) strlcpy(zhp->zfs_name, zhp_orig->zfs_name, sizeof (zhp->zfs_name)); zhp->zfs_type = zhp_orig->zfs_type; zhp->zfs_head_type = zhp_orig->zfs_head_type; zhp->zfs_dmustats = zhp_orig->zfs_dmustats; if (zhp_orig->zfs_props != NULL) { if (nvlist_dup(zhp_orig->zfs_props, &zhp->zfs_props, 0) != 0) { (void) no_memory(zhp->zfs_hdl); zfs_close(zhp); return (NULL); } } if (zhp_orig->zfs_user_props != NULL) { if (nvlist_dup(zhp_orig->zfs_user_props, &zhp->zfs_user_props, 0) != 0) { (void) no_memory(zhp->zfs_hdl); zfs_close(zhp); return (NULL); } } if (zhp_orig->zfs_recvd_props != NULL) { if (nvlist_dup(zhp_orig->zfs_recvd_props, &zhp->zfs_recvd_props, 0)) { (void) no_memory(zhp->zfs_hdl); zfs_close(zhp); return (NULL); } } zhp->zfs_mntcheck = zhp_orig->zfs_mntcheck; if (zhp_orig->zfs_mntopts != NULL) { zhp->zfs_mntopts = zfs_strdup(zhp_orig->zfs_hdl, zhp_orig->zfs_mntopts); } zhp->zfs_props_table = zhp_orig->zfs_props_table; return (zhp); } boolean_t zfs_bookmark_exists(const char *path) { nvlist_t *bmarks; nvlist_t *props; char fsname[ZFS_MAX_DATASET_NAME_LEN]; char *bmark_name; char *pound; int err; boolean_t rv; (void) strlcpy(fsname, path, sizeof (fsname)); pound = strchr(fsname, '#'); if (pound == NULL) return (B_FALSE); *pound = '\0'; bmark_name = pound + 1; props = fnvlist_alloc(); err = lzc_get_bookmarks(fsname, props, &bmarks); nvlist_free(props); if (err != 0) { nvlist_free(bmarks); return (B_FALSE); } rv = nvlist_exists(bmarks, bmark_name); nvlist_free(bmarks); return (rv); } zfs_handle_t * make_bookmark_handle(zfs_handle_t *parent, const char *path, nvlist_t *bmark_props) { zfs_handle_t *zhp = calloc(1, sizeof (zfs_handle_t)); if (zhp == NULL) return (NULL); /* Fill in the name. */ zhp->zfs_hdl = parent->zfs_hdl; (void) strlcpy(zhp->zfs_name, path, sizeof (zhp->zfs_name)); /* Set the property lists. */ if (nvlist_dup(bmark_props, &zhp->zfs_props, 0) != 0) { free(zhp); return (NULL); } /* Set the types. */ zhp->zfs_head_type = parent->zfs_head_type; zhp->zfs_type = ZFS_TYPE_BOOKMARK; if ((zhp->zpool_hdl = zpool_handle(zhp)) == NULL) { nvlist_free(zhp->zfs_props); free(zhp); return (NULL); } return (zhp); } struct zfs_open_bookmarks_cb_data { const char *path; zfs_handle_t *zhp; }; static int zfs_open_bookmarks_cb(zfs_handle_t *zhp, void *data) { struct zfs_open_bookmarks_cb_data *dp = data; /* * Is it the one we are looking for? */ if (strcmp(dp->path, zfs_get_name(zhp)) == 0) { /* * We found it. Save it and let the caller know we are done. */ dp->zhp = zhp; return (EEXIST); } /* * Not found. Close the handle and ask for another one. */ zfs_close(zhp); return (0); } /* * Opens the given snapshot, bookmark, filesystem, or volume. The 'types' * argument is a mask of acceptable types. The function will print an * appropriate error message and return NULL if it can't be opened. */ zfs_handle_t * zfs_open(libzfs_handle_t *hdl, const char *path, int types) { zfs_handle_t *zhp; char errbuf[ERRBUFLEN]; char *bookp; (void) snprintf(errbuf, sizeof (errbuf), dgettext(TEXT_DOMAIN, "cannot open '%s'"), path); /* * Validate the name before we even try to open it. */ if (!zfs_validate_name(hdl, path, types, B_FALSE)) { (void) zfs_error(hdl, EZFS_INVALIDNAME, errbuf); return (NULL); } /* * Bookmarks needs to be handled separately. */ bookp = strchr(path, '#'); if (bookp == NULL) { /* * Try to get stats for the dataset, which will tell us if it * exists. */ errno = 0; if ((zhp = make_dataset_handle(hdl, path)) == NULL) { (void) zfs_standard_error(hdl, errno, errbuf); return (NULL); } } else { char dsname[ZFS_MAX_DATASET_NAME_LEN]; zfs_handle_t *pzhp; struct zfs_open_bookmarks_cb_data cb_data = {path, NULL}; /* * We need to cut out '#' and everything after '#' * to get the parent dataset name only. */ assert(bookp - path < sizeof (dsname)); (void) strlcpy(dsname, path, MIN(sizeof (dsname), bookp - path + 1)); /* * Create handle for the parent dataset. */ errno = 0; if ((pzhp = make_dataset_handle(hdl, dsname)) == NULL) { (void) zfs_standard_error(hdl, errno, errbuf); return (NULL); } /* * Iterate bookmarks to find the right one. */ errno = 0; if ((zfs_iter_bookmarks(pzhp, zfs_open_bookmarks_cb, &cb_data) == 0) && (cb_data.zhp == NULL)) { (void) zfs_error(hdl, EZFS_NOENT, errbuf); zfs_close(pzhp); return (NULL); } if (cb_data.zhp == NULL) { (void) zfs_standard_error(hdl, errno, errbuf); zfs_close(pzhp); return (NULL); } zhp = cb_data.zhp; /* * Cleanup. */ zfs_close(pzhp); } if (!(types & zhp->zfs_type)) { (void) zfs_error(hdl, EZFS_BADTYPE, errbuf); zfs_close(zhp); return (NULL); } return (zhp); } /* * Release a ZFS handle. Nothing to do but free the associated memory. */ void zfs_close(zfs_handle_t *zhp) { if (zhp->zfs_mntopts) free(zhp->zfs_mntopts); nvlist_free(zhp->zfs_props); nvlist_free(zhp->zfs_user_props); nvlist_free(zhp->zfs_recvd_props); free(zhp); } typedef struct mnttab_node { struct mnttab mtn_mt; avl_node_t mtn_node; } mnttab_node_t; static int libzfs_mnttab_cache_compare(const void *arg1, const void *arg2) { const mnttab_node_t *mtn1 = (const mnttab_node_t *)arg1; const mnttab_node_t *mtn2 = (const mnttab_node_t *)arg2; int rv; rv = strcmp(mtn1->mtn_mt.mnt_special, mtn2->mtn_mt.mnt_special); return (TREE_ISIGN(rv)); } void libzfs_mnttab_init(libzfs_handle_t *hdl) { pthread_mutex_init(&hdl->libzfs_mnttab_cache_lock, NULL); assert(avl_numnodes(&hdl->libzfs_mnttab_cache) == 0); avl_create(&hdl->libzfs_mnttab_cache, libzfs_mnttab_cache_compare, sizeof (mnttab_node_t), offsetof(mnttab_node_t, mtn_node)); } static int libzfs_mnttab_update(libzfs_handle_t *hdl) { FILE *mnttab; struct mnttab entry; if ((mnttab = fopen(MNTTAB, "re")) == NULL) return (ENOENT); while (getmntent(mnttab, &entry) == 0) { mnttab_node_t *mtn; avl_index_t where; if (strcmp(entry.mnt_fstype, MNTTYPE_ZFS) != 0) continue; mtn = zfs_alloc(hdl, sizeof (mnttab_node_t)); mtn->mtn_mt.mnt_special = zfs_strdup(hdl, entry.mnt_special); mtn->mtn_mt.mnt_mountp = zfs_strdup(hdl, entry.mnt_mountp); mtn->mtn_mt.mnt_fstype = zfs_strdup(hdl, entry.mnt_fstype); mtn->mtn_mt.mnt_mntopts = zfs_strdup(hdl, entry.mnt_mntopts); /* Exclude duplicate mounts */ if (avl_find(&hdl->libzfs_mnttab_cache, mtn, &where) != NULL) { free(mtn->mtn_mt.mnt_special); free(mtn->mtn_mt.mnt_mountp); free(mtn->mtn_mt.mnt_fstype); free(mtn->mtn_mt.mnt_mntopts); free(mtn); continue; } avl_add(&hdl->libzfs_mnttab_cache, mtn); } (void) fclose(mnttab); return (0); } void libzfs_mnttab_fini(libzfs_handle_t *hdl) { void *cookie = NULL; mnttab_node_t *mtn; while ((mtn = avl_destroy_nodes(&hdl->libzfs_mnttab_cache, &cookie)) != NULL) { free(mtn->mtn_mt.mnt_special); free(mtn->mtn_mt.mnt_mountp); free(mtn->mtn_mt.mnt_fstype); free(mtn->mtn_mt.mnt_mntopts); free(mtn); } avl_destroy(&hdl->libzfs_mnttab_cache); (void) pthread_mutex_destroy(&hdl->libzfs_mnttab_cache_lock); } void libzfs_mnttab_cache(libzfs_handle_t *hdl, boolean_t enable) { hdl->libzfs_mnttab_enable = enable; } int libzfs_mnttab_find(libzfs_handle_t *hdl, const char *fsname, struct mnttab *entry) { FILE *mnttab; mnttab_node_t find; mnttab_node_t *mtn; int ret = ENOENT; if (!hdl->libzfs_mnttab_enable) { struct mnttab srch = { 0 }; if (avl_numnodes(&hdl->libzfs_mnttab_cache)) libzfs_mnttab_fini(hdl); if ((mnttab = fopen(MNTTAB, "re")) == NULL) return (ENOENT); srch.mnt_special = (char *)fsname; srch.mnt_fstype = (char *)MNTTYPE_ZFS; ret = getmntany(mnttab, entry, &srch) ? ENOENT : 0; (void) fclose(mnttab); return (ret); } pthread_mutex_lock(&hdl->libzfs_mnttab_cache_lock); if (avl_numnodes(&hdl->libzfs_mnttab_cache) == 0) { int error; if ((error = libzfs_mnttab_update(hdl)) != 0) { pthread_mutex_unlock(&hdl->libzfs_mnttab_cache_lock); return (error); } } find.mtn_mt.mnt_special = (char *)fsname; mtn = avl_find(&hdl->libzfs_mnttab_cache, &find, NULL); if (mtn) { *entry = mtn->mtn_mt; ret = 0; } pthread_mutex_unlock(&hdl->libzfs_mnttab_cache_lock); return (ret); } void libzfs_mnttab_add(libzfs_handle_t *hdl, const char *special, const char *mountp, const char *mntopts) { mnttab_node_t *mtn; pthread_mutex_lock(&hdl->libzfs_mnttab_cache_lock); if (avl_numnodes(&hdl->libzfs_mnttab_cache) != 0) { mtn = zfs_alloc(hdl, sizeof (mnttab_node_t)); mtn->mtn_mt.mnt_special = zfs_strdup(hdl, special); mtn->mtn_mt.mnt_mountp = zfs_strdup(hdl, mountp); mtn->mtn_mt.mnt_fstype = zfs_strdup(hdl, MNTTYPE_ZFS); mtn->mtn_mt.mnt_mntopts = zfs_strdup(hdl, mntopts); /* * Another thread may have already added this entry * via libzfs_mnttab_update. If so we should skip it. */ if (avl_find(&hdl->libzfs_mnttab_cache, mtn, NULL) != NULL) { free(mtn->mtn_mt.mnt_special); free(mtn->mtn_mt.mnt_mountp); free(mtn->mtn_mt.mnt_fstype); free(mtn->mtn_mt.mnt_mntopts); free(mtn); } else { avl_add(&hdl->libzfs_mnttab_cache, mtn); } } pthread_mutex_unlock(&hdl->libzfs_mnttab_cache_lock); } void libzfs_mnttab_remove(libzfs_handle_t *hdl, const char *fsname) { mnttab_node_t find; mnttab_node_t *ret; pthread_mutex_lock(&hdl->libzfs_mnttab_cache_lock); find.mtn_mt.mnt_special = (char *)fsname; if ((ret = avl_find(&hdl->libzfs_mnttab_cache, (void *)&find, NULL)) != NULL) { avl_remove(&hdl->libzfs_mnttab_cache, ret); free(ret->mtn_mt.mnt_special); free(ret->mtn_mt.mnt_mountp); free(ret->mtn_mt.mnt_fstype); free(ret->mtn_mt.mnt_mntopts); free(ret); } pthread_mutex_unlock(&hdl->libzfs_mnttab_cache_lock); } int zfs_spa_version(zfs_handle_t *zhp, int *spa_version) { zpool_handle_t *zpool_handle = zhp->zpool_hdl; if (zpool_handle == NULL) return (-1); *spa_version = zpool_get_prop_int(zpool_handle, ZPOOL_PROP_VERSION, NULL); return (0); } /* * The choice of reservation property depends on the SPA version. */ static int zfs_which_resv_prop(zfs_handle_t *zhp, zfs_prop_t *resv_prop) { int spa_version; if (zfs_spa_version(zhp, &spa_version) < 0) return (-1); if (spa_version >= SPA_VERSION_REFRESERVATION) *resv_prop = ZFS_PROP_REFRESERVATION; else *resv_prop = ZFS_PROP_RESERVATION; return (0); } /* * Given an nvlist of properties to set, validates that they are correct, and * parses any numeric properties (index, boolean, etc) if they are specified as * strings. */ nvlist_t * zfs_valid_proplist(libzfs_handle_t *hdl, zfs_type_t type, nvlist_t *nvl, uint64_t zoned, zfs_handle_t *zhp, zpool_handle_t *zpool_hdl, boolean_t key_params_ok, const char *errbuf) { nvpair_t *elem; uint64_t intval; char *strval; zfs_prop_t prop; nvlist_t *ret; int chosen_normal = -1; int chosen_utf = -1; if (nvlist_alloc(&ret, NV_UNIQUE_NAME, 0) != 0) { (void) no_memory(hdl); return (NULL); } /* * Make sure this property is valid and applies to this type. */ elem = NULL; while ((elem = nvlist_next_nvpair(nvl, elem)) != NULL) { const char *propname = nvpair_name(elem); prop = zfs_name_to_prop(propname); if (prop == ZPROP_USERPROP && zfs_prop_user(propname)) { /* * This is a user property: make sure it's a * string, and that it's less than ZAP_MAXNAMELEN. */ if (nvpair_type(elem) != DATA_TYPE_STRING) { zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "'%s' must be a string"), propname); (void) zfs_error(hdl, EZFS_BADPROP, errbuf); goto error; } if (strlen(nvpair_name(elem)) >= ZAP_MAXNAMELEN) { zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "property name '%s' is too long"), propname); (void) zfs_error(hdl, EZFS_BADPROP, errbuf); goto error; } (void) nvpair_value_string(elem, &strval); if (nvlist_add_string(ret, propname, strval) != 0) { (void) no_memory(hdl); goto error; } continue; } /* * Currently, only user properties can be modified on * snapshots. */ if (type == ZFS_TYPE_SNAPSHOT) { zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "this property can not be modified for snapshots")); (void) zfs_error(hdl, EZFS_PROPTYPE, errbuf); goto error; } if (prop == ZPROP_USERPROP && zfs_prop_userquota(propname)) { zfs_userquota_prop_t uqtype; char *newpropname = NULL; char domain[128]; uint64_t rid; uint64_t valary[3]; int rc; if (userquota_propname_decode(propname, zoned, &uqtype, domain, sizeof (domain), &rid) != 0) { zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "'%s' has an invalid user/group name"), propname); (void) zfs_error(hdl, EZFS_BADPROP, errbuf); goto error; } if (uqtype != ZFS_PROP_USERQUOTA && uqtype != ZFS_PROP_GROUPQUOTA && uqtype != ZFS_PROP_USEROBJQUOTA && uqtype != ZFS_PROP_GROUPOBJQUOTA && uqtype != ZFS_PROP_PROJECTQUOTA && uqtype != ZFS_PROP_PROJECTOBJQUOTA) { zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "'%s' is readonly"), propname); (void) zfs_error(hdl, EZFS_PROPREADONLY, errbuf); goto error; } if (nvpair_type(elem) == DATA_TYPE_STRING) { (void) nvpair_value_string(elem, &strval); if (strcmp(strval, "none") == 0) { intval = 0; } else if (zfs_nicestrtonum(hdl, strval, &intval) != 0) { (void) zfs_error(hdl, EZFS_BADPROP, errbuf); goto error; } } else if (nvpair_type(elem) == DATA_TYPE_UINT64) { (void) nvpair_value_uint64(elem, &intval); if (intval == 0) { zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "use 'none' to disable " "{user|group|project}quota")); goto error; } } else { zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "'%s' must be a number"), propname); (void) zfs_error(hdl, EZFS_BADPROP, errbuf); goto error; } /* * Encode the prop name as * userquota@-domain, to make it easy * for the kernel to decode. */ rc = asprintf(&newpropname, "%s%llx-%s", zfs_userquota_prop_prefixes[uqtype], (longlong_t)rid, domain); if (rc == -1 || newpropname == NULL) { (void) no_memory(hdl); goto error; } valary[0] = uqtype; valary[1] = rid; valary[2] = intval; if (nvlist_add_uint64_array(ret, newpropname, valary, 3) != 0) { free(newpropname); (void) no_memory(hdl); goto error; } free(newpropname); continue; } else if (prop == ZPROP_USERPROP && zfs_prop_written(propname)) { zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "'%s' is readonly"), propname); (void) zfs_error(hdl, EZFS_PROPREADONLY, errbuf); goto error; } if (prop == ZPROP_INVAL) { zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "invalid property '%s'"), propname); (void) zfs_error(hdl, EZFS_BADPROP, errbuf); goto error; } if (!zfs_prop_valid_for_type(prop, type, B_FALSE)) { zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "'%s' does not " "apply to datasets of this type"), propname); (void) zfs_error(hdl, EZFS_PROPTYPE, errbuf); goto error; } if (zfs_prop_readonly(prop) && !(zfs_prop_setonce(prop) && zhp == NULL) && !(zfs_prop_encryption_key_param(prop) && key_params_ok)) { zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "'%s' is readonly"), propname); (void) zfs_error(hdl, EZFS_PROPREADONLY, errbuf); goto error; } if (zprop_parse_value(hdl, elem, prop, type, ret, &strval, &intval, errbuf) != 0) goto error; /* * Perform some additional checks for specific properties. */ switch (prop) { case ZFS_PROP_VERSION: { int version; if (zhp == NULL) break; version = zfs_prop_get_int(zhp, ZFS_PROP_VERSION); if (intval < version) { zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "Can not downgrade; already at version %u"), version); (void) zfs_error(hdl, EZFS_BADPROP, errbuf); goto error; } break; } case ZFS_PROP_VOLBLOCKSIZE: case ZFS_PROP_RECORDSIZE: { int maxbs = SPA_MAXBLOCKSIZE; char buf[64]; if (zpool_hdl != NULL) { maxbs = zpool_get_prop_int(zpool_hdl, ZPOOL_PROP_MAXBLOCKSIZE, NULL); } /* * The value must be a power of two between * SPA_MINBLOCKSIZE and maxbs. */ if (intval < SPA_MINBLOCKSIZE || intval > maxbs || !ISP2(intval)) { zfs_nicebytes(maxbs, buf, sizeof (buf)); zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "'%s' must be power of 2 from 512B " "to %s"), propname, buf); (void) zfs_error(hdl, EZFS_BADPROP, errbuf); goto error; } break; } case ZFS_PROP_SPECIAL_SMALL_BLOCKS: { int maxbs = SPA_OLD_MAXBLOCKSIZE; char buf[64]; if (zpool_hdl != NULL) { char state[64] = ""; maxbs = zpool_get_prop_int(zpool_hdl, ZPOOL_PROP_MAXBLOCKSIZE, NULL); /* * Issue a warning but do not fail so that * tests for settable properties succeed. */ if (zpool_prop_get_feature(zpool_hdl, "feature@allocation_classes", state, sizeof (state)) != 0 || strcmp(state, ZFS_FEATURE_ACTIVE) != 0) { (void) fprintf(stderr, gettext( "%s: property requires a special " "device in the pool\n"), propname); } } if (intval != 0 && (intval < SPA_MINBLOCKSIZE || intval > maxbs || !ISP2(intval))) { zfs_nicebytes(maxbs, buf, sizeof (buf)); zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "invalid '%s=%llu' property: must be zero " "or a power of 2 from 512B to %s"), propname, (unsigned long long)intval, buf); (void) zfs_error(hdl, EZFS_BADPROP, errbuf); goto error; } break; } case ZFS_PROP_MLSLABEL: { #ifdef HAVE_MLSLABEL /* * Verify the mlslabel string and convert to * internal hex label string. */ m_label_t *new_sl; char *hex = NULL; /* internal label string */ /* Default value is already OK. */ if (strcasecmp(strval, ZFS_MLSLABEL_DEFAULT) == 0) break; /* Verify the label can be converted to binary form */ if (((new_sl = m_label_alloc(MAC_LABEL)) == NULL) || (str_to_label(strval, &new_sl, MAC_LABEL, L_NO_CORRECTION, NULL) == -1)) { goto badlabel; } /* Now translate to hex internal label string */ if (label_to_str(new_sl, &hex, M_INTERNAL, DEF_NAMES) != 0) { if (hex) free(hex); goto badlabel; } m_label_free(new_sl); /* If string is already in internal form, we're done. */ if (strcmp(strval, hex) == 0) { free(hex); break; } /* Replace the label string with the internal form. */ (void) nvlist_remove(ret, zfs_prop_to_name(prop), DATA_TYPE_STRING); fnvlist_add_string(ret, zfs_prop_to_name(prop), hex); free(hex); break; badlabel: zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "invalid mlslabel '%s'"), strval); (void) zfs_error(hdl, EZFS_BADPROP, errbuf); m_label_free(new_sl); /* OK if null */ goto error; #else zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "mlslabels are unsupported")); (void) zfs_error(hdl, EZFS_BADPROP, errbuf); goto error; #endif /* HAVE_MLSLABEL */ } case ZFS_PROP_MOUNTPOINT: { namecheck_err_t why; if (strcmp(strval, ZFS_MOUNTPOINT_NONE) == 0 || strcmp(strval, ZFS_MOUNTPOINT_LEGACY) == 0) break; if (mountpoint_namecheck(strval, &why)) { switch (why) { case NAME_ERR_LEADING_SLASH: zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "'%s' must be an absolute path, " "'none', or 'legacy'"), propname); break; case NAME_ERR_TOOLONG: zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "component of '%s' is too long"), propname); break; default: zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "(%d) not defined"), why); break; } (void) zfs_error(hdl, EZFS_BADPROP, errbuf); goto error; } zfs_fallthrough; } case ZFS_PROP_SHARESMB: case ZFS_PROP_SHARENFS: /* * For the mountpoint and sharenfs or sharesmb * properties, check if it can be set in a * global/non-global zone based on * the zoned property value: * * global zone non-global zone * -------------------------------------------------- * zoned=on mountpoint (no) mountpoint (yes) * sharenfs (no) sharenfs (no) * sharesmb (no) sharesmb (no) * * zoned=off mountpoint (yes) N/A * sharenfs (yes) * sharesmb (yes) */ if (zoned) { if (getzoneid() == GLOBAL_ZONEID) { zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "'%s' cannot be set on " "dataset in a non-global zone"), propname); (void) zfs_error(hdl, EZFS_ZONED, errbuf); goto error; } else if (prop == ZFS_PROP_SHARENFS || prop == ZFS_PROP_SHARESMB) { zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "'%s' cannot be set in " "a non-global zone"), propname); (void) zfs_error(hdl, EZFS_ZONED, errbuf); goto error; } } else if (getzoneid() != GLOBAL_ZONEID) { /* * If zoned property is 'off', this must be in * a global zone. If not, something is wrong. */ zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "'%s' cannot be set while dataset " "'zoned' property is set"), propname); (void) zfs_error(hdl, EZFS_ZONED, errbuf); goto error; } /* * At this point, it is legitimate to set the * property. Now we want to make sure that the * property value is valid if it is sharenfs. */ if ((prop == ZFS_PROP_SHARENFS || prop == ZFS_PROP_SHARESMB) && strcmp(strval, "on") != 0 && strcmp(strval, "off") != 0) { enum sa_protocol proto; if (prop == ZFS_PROP_SHARESMB) proto = SA_PROTOCOL_SMB; else proto = SA_PROTOCOL_NFS; if (sa_validate_shareopts(strval, proto) != SA_OK) { zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "'%s' cannot be set to invalid " "options"), propname); (void) zfs_error(hdl, EZFS_BADPROP, errbuf); goto error; } } break; case ZFS_PROP_KEYLOCATION: if (!zfs_prop_valid_keylocation(strval, B_FALSE)) { zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "invalid keylocation")); (void) zfs_error(hdl, EZFS_BADPROP, errbuf); goto error; } if (zhp != NULL) { uint64_t crypt = zfs_prop_get_int(zhp, ZFS_PROP_ENCRYPTION); if (crypt == ZIO_CRYPT_OFF && strcmp(strval, "none") != 0) { zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "keylocation must be 'none' " "for unencrypted datasets")); (void) zfs_error(hdl, EZFS_BADPROP, errbuf); goto error; } else if (crypt != ZIO_CRYPT_OFF && strcmp(strval, "none") == 0) { zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "keylocation must not be 'none' " "for encrypted datasets")); (void) zfs_error(hdl, EZFS_BADPROP, errbuf); goto error; } } break; case ZFS_PROP_PBKDF2_ITERS: if (intval < MIN_PBKDF2_ITERATIONS) { zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "minimum pbkdf2 iterations is %u"), MIN_PBKDF2_ITERATIONS); (void) zfs_error(hdl, EZFS_BADPROP, errbuf); goto error; } break; case ZFS_PROP_UTF8ONLY: chosen_utf = (int)intval; break; case ZFS_PROP_NORMALIZE: chosen_normal = (int)intval; break; default: break; } /* * For changes to existing volumes, we have some additional * checks to enforce. */ if (type == ZFS_TYPE_VOLUME && zhp != NULL) { uint64_t blocksize = zfs_prop_get_int(zhp, ZFS_PROP_VOLBLOCKSIZE); char buf[64]; switch (prop) { case ZFS_PROP_VOLSIZE: if (intval % blocksize != 0) { zfs_nicebytes(blocksize, buf, sizeof (buf)); zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "'%s' must be a multiple of " "volume block size (%s)"), propname, buf); (void) zfs_error(hdl, EZFS_BADPROP, errbuf); goto error; } if (intval == 0) { zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "'%s' cannot be zero"), propname); (void) zfs_error(hdl, EZFS_BADPROP, errbuf); goto error; } break; default: break; } } /* check encryption properties */ if (zhp != NULL) { int64_t crypt = zfs_prop_get_int(zhp, ZFS_PROP_ENCRYPTION); switch (prop) { case ZFS_PROP_COPIES: if (crypt != ZIO_CRYPT_OFF && intval > 2) { zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "encrypted datasets cannot have " "3 copies")); (void) zfs_error(hdl, EZFS_BADPROP, errbuf); goto error; } break; default: break; } } } /* * If normalization was chosen, but no UTF8 choice was made, * enforce rejection of non-UTF8 names. * * If normalization was chosen, but rejecting non-UTF8 names * was explicitly not chosen, it is an error. * * If utf8only was turned off, but the parent has normalization, * turn off normalization. */ if (chosen_normal > 0 && chosen_utf < 0) { if (nvlist_add_uint64(ret, zfs_prop_to_name(ZFS_PROP_UTF8ONLY), 1) != 0) { (void) no_memory(hdl); goto error; } } else if (chosen_normal > 0 && chosen_utf == 0) { zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "'%s' must be set 'on' if normalization chosen"), zfs_prop_to_name(ZFS_PROP_UTF8ONLY)); (void) zfs_error(hdl, EZFS_BADPROP, errbuf); goto error; } else if (chosen_normal < 0 && chosen_utf == 0) { if (nvlist_add_uint64(ret, zfs_prop_to_name(ZFS_PROP_NORMALIZE), 0) != 0) { (void) no_memory(hdl); goto error; } } return (ret); error: nvlist_free(ret); return (NULL); } static int zfs_add_synthetic_resv(zfs_handle_t *zhp, nvlist_t *nvl) { uint64_t old_volsize; uint64_t new_volsize; uint64_t old_reservation; uint64_t new_reservation; zfs_prop_t resv_prop; nvlist_t *props; zpool_handle_t *zph = zpool_handle(zhp); /* * If this is an existing volume, and someone is setting the volsize, * make sure that it matches the reservation, or add it if necessary. */ old_volsize = zfs_prop_get_int(zhp, ZFS_PROP_VOLSIZE); if (zfs_which_resv_prop(zhp, &resv_prop) < 0) return (-1); old_reservation = zfs_prop_get_int(zhp, resv_prop); props = fnvlist_alloc(); fnvlist_add_uint64(props, zfs_prop_to_name(ZFS_PROP_VOLBLOCKSIZE), zfs_prop_get_int(zhp, ZFS_PROP_VOLBLOCKSIZE)); if ((zvol_volsize_to_reservation(zph, old_volsize, props) != old_reservation) || nvlist_exists(nvl, zfs_prop_to_name(resv_prop))) { fnvlist_free(props); return (0); } if (nvlist_lookup_uint64(nvl, zfs_prop_to_name(ZFS_PROP_VOLSIZE), &new_volsize) != 0) { fnvlist_free(props); return (-1); } new_reservation = zvol_volsize_to_reservation(zph, new_volsize, props); fnvlist_free(props); if (nvlist_add_uint64(nvl, zfs_prop_to_name(resv_prop), new_reservation) != 0) { (void) no_memory(zhp->zfs_hdl); return (-1); } return (1); } /* * Helper for 'zfs {set|clone} refreservation=auto'. Must be called after * zfs_valid_proplist(), as it is what sets the UINT64_MAX sentinel value. * Return codes must match zfs_add_synthetic_resv(). */ static int zfs_fix_auto_resv(zfs_handle_t *zhp, nvlist_t *nvl) { uint64_t volsize; uint64_t resvsize; zfs_prop_t prop; nvlist_t *props; if (!ZFS_IS_VOLUME(zhp)) { return (0); } if (zfs_which_resv_prop(zhp, &prop) != 0) { return (-1); } if (prop != ZFS_PROP_REFRESERVATION) { return (0); } if (nvlist_lookup_uint64(nvl, zfs_prop_to_name(prop), &resvsize) != 0) { /* No value being set, so it can't be "auto" */ return (0); } if (resvsize != UINT64_MAX) { /* Being set to a value other than "auto" */ return (0); } props = fnvlist_alloc(); fnvlist_add_uint64(props, zfs_prop_to_name(ZFS_PROP_VOLBLOCKSIZE), zfs_prop_get_int(zhp, ZFS_PROP_VOLBLOCKSIZE)); if (nvlist_lookup_uint64(nvl, zfs_prop_to_name(ZFS_PROP_VOLSIZE), &volsize) != 0) { volsize = zfs_prop_get_int(zhp, ZFS_PROP_VOLSIZE); } resvsize = zvol_volsize_to_reservation(zpool_handle(zhp), volsize, props); fnvlist_free(props); (void) nvlist_remove_all(nvl, zfs_prop_to_name(prop)); if (nvlist_add_uint64(nvl, zfs_prop_to_name(prop), resvsize) != 0) { (void) no_memory(zhp->zfs_hdl); return (-1); } return (1); } static boolean_t zfs_is_namespace_prop(zfs_prop_t prop) { switch (prop) { case ZFS_PROP_ATIME: case ZFS_PROP_RELATIME: case ZFS_PROP_DEVICES: case ZFS_PROP_EXEC: case ZFS_PROP_SETUID: case ZFS_PROP_READONLY: case ZFS_PROP_XATTR: case ZFS_PROP_NBMAND: return (B_TRUE); default: return (B_FALSE); } } /* * Given a property name and value, set the property for the given dataset. */ int zfs_prop_set(zfs_handle_t *zhp, const char *propname, const char *propval) { int ret = -1; char errbuf[ERRBUFLEN]; libzfs_handle_t *hdl = zhp->zfs_hdl; nvlist_t *nvl = NULL; (void) snprintf(errbuf, sizeof (errbuf), dgettext(TEXT_DOMAIN, "cannot set property for '%s'"), zhp->zfs_name); if (nvlist_alloc(&nvl, NV_UNIQUE_NAME, 0) != 0 || nvlist_add_string(nvl, propname, propval) != 0) { (void) no_memory(hdl); goto error; } ret = zfs_prop_set_list(zhp, nvl); error: nvlist_free(nvl); return (ret); } /* * Given an nvlist of property names and values, set the properties for the * given dataset. */ int zfs_prop_set_list(zfs_handle_t *zhp, nvlist_t *props) { zfs_cmd_t zc = {"\0"}; int ret = -1; prop_changelist_t **cls = NULL; int cl_idx; char errbuf[ERRBUFLEN]; libzfs_handle_t *hdl = zhp->zfs_hdl; nvlist_t *nvl; int nvl_len = 0; int added_resv = 0; zfs_prop_t prop = 0; nvpair_t *elem; (void) snprintf(errbuf, sizeof (errbuf), dgettext(TEXT_DOMAIN, "cannot set property for '%s'"), zhp->zfs_name); if ((nvl = zfs_valid_proplist(hdl, zhp->zfs_type, props, zfs_prop_get_int(zhp, ZFS_PROP_ZONED), zhp, zhp->zpool_hdl, B_FALSE, errbuf)) == NULL) goto error; /* * We have to check for any extra properties which need to be added * before computing the length of the nvlist. */ for (elem = nvlist_next_nvpair(nvl, NULL); elem != NULL; elem = nvlist_next_nvpair(nvl, elem)) { if (zfs_name_to_prop(nvpair_name(elem)) == ZFS_PROP_VOLSIZE && (added_resv = zfs_add_synthetic_resv(zhp, nvl)) == -1) { goto error; } } if (added_resv != 1 && (added_resv = zfs_fix_auto_resv(zhp, nvl)) == -1) { goto error; } /* * Check how many properties we're setting and allocate an array to * store changelist pointers for postfix(). */ for (elem = nvlist_next_nvpair(nvl, NULL); elem != NULL; elem = nvlist_next_nvpair(nvl, elem)) nvl_len++; if ((cls = calloc(nvl_len, sizeof (prop_changelist_t *))) == NULL) goto error; cl_idx = 0; for (elem = nvlist_next_nvpair(nvl, NULL); elem != NULL; elem = nvlist_next_nvpair(nvl, elem)) { prop = zfs_name_to_prop(nvpair_name(elem)); assert(cl_idx < nvl_len); /* * We don't want to unmount & remount the dataset when changing * its canmount property to 'on' or 'noauto'. We only use * the changelist logic to unmount when setting canmount=off. */ if (prop != ZFS_PROP_CANMOUNT || (fnvpair_value_uint64(elem) == ZFS_CANMOUNT_OFF && zfs_is_mounted(zhp, NULL))) { cls[cl_idx] = changelist_gather(zhp, prop, 0, 0); if (cls[cl_idx] == NULL) goto error; } if (prop == ZFS_PROP_MOUNTPOINT && changelist_haszonedchild(cls[cl_idx])) { zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "child dataset with inherited mountpoint is used " "in a non-global zone")); ret = zfs_error(hdl, EZFS_ZONED, errbuf); goto error; } if (cls[cl_idx] != NULL && (ret = changelist_prefix(cls[cl_idx])) != 0) goto error; cl_idx++; } assert(cl_idx == nvl_len); /* * Execute the corresponding ioctl() to set this list of properties. */ (void) strlcpy(zc.zc_name, zhp->zfs_name, sizeof (zc.zc_name)); zcmd_write_src_nvlist(hdl, &zc, nvl); zcmd_alloc_dst_nvlist(hdl, &zc, 0); ret = zfs_ioctl(hdl, ZFS_IOC_SET_PROP, &zc); if (ret != 0) { if (zc.zc_nvlist_dst_filled == B_FALSE) { (void) zfs_standard_error(hdl, errno, errbuf); goto error; } /* Get the list of unset properties back and report them. */ nvlist_t *errorprops = NULL; if (zcmd_read_dst_nvlist(hdl, &zc, &errorprops) != 0) goto error; for (nvpair_t *elem = nvlist_next_nvpair(errorprops, NULL); elem != NULL; elem = nvlist_next_nvpair(errorprops, elem)) { prop = zfs_name_to_prop(nvpair_name(elem)); zfs_setprop_error(hdl, prop, errno, errbuf); } nvlist_free(errorprops); if (added_resv && errno == ENOSPC) { /* clean up the volsize property we tried to set */ uint64_t old_volsize = zfs_prop_get_int(zhp, ZFS_PROP_VOLSIZE); nvlist_free(nvl); nvl = NULL; zcmd_free_nvlists(&zc); if (nvlist_alloc(&nvl, NV_UNIQUE_NAME, 0) != 0) goto error; if (nvlist_add_uint64(nvl, zfs_prop_to_name(ZFS_PROP_VOLSIZE), old_volsize) != 0) goto error; zcmd_write_src_nvlist(hdl, &zc, nvl); (void) zfs_ioctl(hdl, ZFS_IOC_SET_PROP, &zc); } } else { for (cl_idx = 0; cl_idx < nvl_len; cl_idx++) { if (cls[cl_idx] != NULL) { int clp_err = changelist_postfix(cls[cl_idx]); if (clp_err != 0) ret = clp_err; } } if (ret == 0) { /* * Refresh the statistics so the new property * value is reflected. */ (void) get_stats(zhp); /* * Remount the filesystem to propagate the change * if one of the options handled by the generic * Linux namespace layer has been modified. */ if (zfs_is_namespace_prop(prop) && zfs_is_mounted(zhp, NULL)) ret = zfs_mount(zhp, MNTOPT_REMOUNT, 0); } } error: nvlist_free(nvl); zcmd_free_nvlists(&zc); if (cls != NULL) { for (cl_idx = 0; cl_idx < nvl_len; cl_idx++) { if (cls[cl_idx] != NULL) changelist_free(cls[cl_idx]); } free(cls); } return (ret); } /* * Given a property, inherit the value from the parent dataset, or if received * is TRUE, revert to the received value, if any. */ int zfs_prop_inherit(zfs_handle_t *zhp, const char *propname, boolean_t received) { zfs_cmd_t zc = {"\0"}; int ret; prop_changelist_t *cl; libzfs_handle_t *hdl = zhp->zfs_hdl; char errbuf[ERRBUFLEN]; zfs_prop_t prop; (void) snprintf(errbuf, sizeof (errbuf), dgettext(TEXT_DOMAIN, "cannot inherit %s for '%s'"), propname, zhp->zfs_name); zc.zc_cookie = received; if ((prop = zfs_name_to_prop(propname)) == ZPROP_USERPROP) { /* * For user properties, the amount of work we have to do is very * small, so just do it here. */ if (!zfs_prop_user(propname)) { zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "invalid property")); return (zfs_error(hdl, EZFS_BADPROP, errbuf)); } (void) strlcpy(zc.zc_name, zhp->zfs_name, sizeof (zc.zc_name)); (void) strlcpy(zc.zc_value, propname, sizeof (zc.zc_value)); if (zfs_ioctl(zhp->zfs_hdl, ZFS_IOC_INHERIT_PROP, &zc) != 0) return (zfs_standard_error(hdl, errno, errbuf)); (void) get_stats(zhp); return (0); } /* * Verify that this property is inheritable. */ if (zfs_prop_readonly(prop)) return (zfs_error(hdl, EZFS_PROPREADONLY, errbuf)); if (!zfs_prop_inheritable(prop) && !received) return (zfs_error(hdl, EZFS_PROPNONINHERIT, errbuf)); /* * Check to see if the value applies to this type */ if (!zfs_prop_valid_for_type(prop, zhp->zfs_type, B_FALSE)) return (zfs_error(hdl, EZFS_PROPTYPE, errbuf)); /* * Normalize the name, to get rid of shorthand abbreviations. */ propname = zfs_prop_to_name(prop); (void) strlcpy(zc.zc_name, zhp->zfs_name, sizeof (zc.zc_name)); (void) strlcpy(zc.zc_value, propname, sizeof (zc.zc_value)); if (prop == ZFS_PROP_MOUNTPOINT && getzoneid() == GLOBAL_ZONEID && zfs_prop_get_int(zhp, ZFS_PROP_ZONED)) { zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "dataset is used in a non-global zone")); return (zfs_error(hdl, EZFS_ZONED, errbuf)); } /* * Determine datasets which will be affected by this change, if any. */ if ((cl = changelist_gather(zhp, prop, 0, 0)) == NULL) return (-1); if (prop == ZFS_PROP_MOUNTPOINT && changelist_haszonedchild(cl)) { zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "child dataset with inherited mountpoint is used " "in a non-global zone")); ret = zfs_error(hdl, EZFS_ZONED, errbuf); goto error; } if ((ret = changelist_prefix(cl)) != 0) goto error; - if ((ret = zfs_ioctl(zhp->zfs_hdl, ZFS_IOC_INHERIT_PROP, &zc)) != 0) { + if (zfs_ioctl(zhp->zfs_hdl, ZFS_IOC_INHERIT_PROP, &zc) != 0) { changelist_free(cl); return (zfs_standard_error(hdl, errno, errbuf)); } else { if ((ret = changelist_postfix(cl)) != 0) goto error; /* * Refresh the statistics so the new property is reflected. */ (void) get_stats(zhp); /* * Remount the filesystem to propagate the change * if one of the options handled by the generic * Linux namespace layer has been modified. */ if (zfs_is_namespace_prop(prop) && zfs_is_mounted(zhp, NULL)) ret = zfs_mount(zhp, MNTOPT_REMOUNT, 0); } error: changelist_free(cl); return (ret); } /* * True DSL properties are stored in an nvlist. The following two functions * extract them appropriately. */ uint64_t getprop_uint64(zfs_handle_t *zhp, zfs_prop_t prop, char **source) { nvlist_t *nv; uint64_t value; *source = NULL; if (nvlist_lookup_nvlist(zhp->zfs_props, zfs_prop_to_name(prop), &nv) == 0) { value = fnvlist_lookup_uint64(nv, ZPROP_VALUE); (void) nvlist_lookup_string(nv, ZPROP_SOURCE, source); } else { verify(!zhp->zfs_props_table || zhp->zfs_props_table[prop] == B_TRUE); value = zfs_prop_default_numeric(prop); *source = (char *)""; } return (value); } static const char * getprop_string(zfs_handle_t *zhp, zfs_prop_t prop, char **source) { nvlist_t *nv; const char *value; *source = NULL; if (nvlist_lookup_nvlist(zhp->zfs_props, zfs_prop_to_name(prop), &nv) == 0) { value = fnvlist_lookup_string(nv, ZPROP_VALUE); (void) nvlist_lookup_string(nv, ZPROP_SOURCE, source); } else { verify(!zhp->zfs_props_table || zhp->zfs_props_table[prop] == B_TRUE); value = zfs_prop_default_string(prop); *source = (char *)""; } return (value); } static boolean_t zfs_is_recvd_props_mode(zfs_handle_t *zhp) { return (zhp->zfs_props == zhp->zfs_recvd_props); } static void zfs_set_recvd_props_mode(zfs_handle_t *zhp, uint64_t *cookie) { *cookie = (uint64_t)(uintptr_t)zhp->zfs_props; zhp->zfs_props = zhp->zfs_recvd_props; } static void zfs_unset_recvd_props_mode(zfs_handle_t *zhp, uint64_t *cookie) { zhp->zfs_props = (nvlist_t *)(uintptr_t)*cookie; *cookie = 0; } /* * Internal function for getting a numeric property. Both zfs_prop_get() and * zfs_prop_get_int() are built using this interface. * * Certain properties can be overridden using 'mount -o'. In this case, scan * the contents of the /proc/self/mounts entry, searching for the * appropriate options. If they differ from the on-disk values, report the * current values and mark the source "temporary". */ static int get_numeric_property(zfs_handle_t *zhp, zfs_prop_t prop, zprop_source_t *src, char **source, uint64_t *val) { zfs_cmd_t zc = {"\0"}; nvlist_t *zplprops = NULL; struct mnttab mnt; const char *mntopt_on = NULL; const char *mntopt_off = NULL; boolean_t received = zfs_is_recvd_props_mode(zhp); *source = NULL; /* * If the property is being fetched for a snapshot, check whether * the property is valid for the snapshot's head dataset type. */ if (zhp->zfs_type == ZFS_TYPE_SNAPSHOT && !zfs_prop_valid_for_type(prop, zhp->zfs_head_type, B_TRUE)) { *val = zfs_prop_default_numeric(prop); return (-1); } switch (prop) { case ZFS_PROP_ATIME: mntopt_on = MNTOPT_ATIME; mntopt_off = MNTOPT_NOATIME; break; case ZFS_PROP_RELATIME: mntopt_on = MNTOPT_RELATIME; mntopt_off = MNTOPT_NORELATIME; break; case ZFS_PROP_DEVICES: mntopt_on = MNTOPT_DEVICES; mntopt_off = MNTOPT_NODEVICES; break; case ZFS_PROP_EXEC: mntopt_on = MNTOPT_EXEC; mntopt_off = MNTOPT_NOEXEC; break; case ZFS_PROP_READONLY: mntopt_on = MNTOPT_RO; mntopt_off = MNTOPT_RW; break; case ZFS_PROP_SETUID: mntopt_on = MNTOPT_SETUID; mntopt_off = MNTOPT_NOSETUID; break; case ZFS_PROP_XATTR: mntopt_on = MNTOPT_XATTR; mntopt_off = MNTOPT_NOXATTR; break; case ZFS_PROP_NBMAND: mntopt_on = MNTOPT_NBMAND; mntopt_off = MNTOPT_NONBMAND; break; default: break; } /* * Because looking up the mount options is potentially expensive * (iterating over all of /proc/self/mounts), we defer its * calculation until we're looking up a property which requires * its presence. */ if (!zhp->zfs_mntcheck && (mntopt_on != NULL || prop == ZFS_PROP_MOUNTED)) { libzfs_handle_t *hdl = zhp->zfs_hdl; struct mnttab entry; if (libzfs_mnttab_find(hdl, zhp->zfs_name, &entry) == 0) zhp->zfs_mntopts = zfs_strdup(hdl, entry.mnt_mntopts); zhp->zfs_mntcheck = B_TRUE; } if (zhp->zfs_mntopts == NULL) mnt.mnt_mntopts = (char *)""; else mnt.mnt_mntopts = zhp->zfs_mntopts; switch (prop) { case ZFS_PROP_ATIME: case ZFS_PROP_RELATIME: case ZFS_PROP_DEVICES: case ZFS_PROP_EXEC: case ZFS_PROP_READONLY: case ZFS_PROP_SETUID: #ifndef __FreeBSD__ case ZFS_PROP_XATTR: #endif case ZFS_PROP_NBMAND: *val = getprop_uint64(zhp, prop, source); if (received) break; if (hasmntopt(&mnt, mntopt_on) && !*val) { *val = B_TRUE; if (src) *src = ZPROP_SRC_TEMPORARY; } else if (hasmntopt(&mnt, mntopt_off) && *val) { *val = B_FALSE; if (src) *src = ZPROP_SRC_TEMPORARY; } break; case ZFS_PROP_CANMOUNT: case ZFS_PROP_VOLSIZE: case ZFS_PROP_QUOTA: case ZFS_PROP_REFQUOTA: case ZFS_PROP_RESERVATION: case ZFS_PROP_REFRESERVATION: case ZFS_PROP_FILESYSTEM_LIMIT: case ZFS_PROP_SNAPSHOT_LIMIT: case ZFS_PROP_FILESYSTEM_COUNT: case ZFS_PROP_SNAPSHOT_COUNT: *val = getprop_uint64(zhp, prop, source); if (*source == NULL) { /* not default, must be local */ *source = zhp->zfs_name; } break; case ZFS_PROP_MOUNTED: *val = (zhp->zfs_mntopts != NULL); break; case ZFS_PROP_NUMCLONES: *val = zhp->zfs_dmustats.dds_num_clones; break; case ZFS_PROP_VERSION: case ZFS_PROP_NORMALIZE: case ZFS_PROP_UTF8ONLY: case ZFS_PROP_CASE: zcmd_alloc_dst_nvlist(zhp->zfs_hdl, &zc, 0); (void) strlcpy(zc.zc_name, zhp->zfs_name, sizeof (zc.zc_name)); if (zfs_ioctl(zhp->zfs_hdl, ZFS_IOC_OBJSET_ZPLPROPS, &zc)) { zcmd_free_nvlists(&zc); if (prop == ZFS_PROP_VERSION && zhp->zfs_type == ZFS_TYPE_VOLUME) *val = zfs_prop_default_numeric(prop); return (-1); } if (zcmd_read_dst_nvlist(zhp->zfs_hdl, &zc, &zplprops) != 0 || nvlist_lookup_uint64(zplprops, zfs_prop_to_name(prop), val) != 0) { zcmd_free_nvlists(&zc); return (-1); } nvlist_free(zplprops); zcmd_free_nvlists(&zc); break; case ZFS_PROP_INCONSISTENT: *val = zhp->zfs_dmustats.dds_inconsistent; break; case ZFS_PROP_REDACTED: *val = zhp->zfs_dmustats.dds_redacted; break; case ZFS_PROP_CREATETXG: /* * We can directly read createtxg property from zfs * handle for Filesystem, Snapshot and ZVOL types. */ if ((zhp->zfs_type == ZFS_TYPE_FILESYSTEM) || (zhp->zfs_type == ZFS_TYPE_SNAPSHOT) || (zhp->zfs_type == ZFS_TYPE_VOLUME)) { *val = zhp->zfs_dmustats.dds_creation_txg; break; } zfs_fallthrough; default: switch (zfs_prop_get_type(prop)) { case PROP_TYPE_NUMBER: case PROP_TYPE_INDEX: *val = getprop_uint64(zhp, prop, source); /* * If we tried to use a default value for a * readonly property, it means that it was not * present. Note this only applies to "truly" * readonly properties, not set-once properties * like volblocksize. */ if (zfs_prop_readonly(prop) && !zfs_prop_setonce(prop) && *source != NULL && (*source)[0] == '\0') { *source = NULL; return (-1); } break; case PROP_TYPE_STRING: default: zfs_error_aux(zhp->zfs_hdl, dgettext(TEXT_DOMAIN, "cannot get non-numeric property")); return (zfs_error(zhp->zfs_hdl, EZFS_BADPROP, dgettext(TEXT_DOMAIN, "internal error"))); } } return (0); } /* * Calculate the source type, given the raw source string. */ static void get_source(zfs_handle_t *zhp, zprop_source_t *srctype, char *source, char *statbuf, size_t statlen) { if (statbuf == NULL || srctype == NULL || *srctype == ZPROP_SRC_TEMPORARY) { return; } if (source == NULL) { *srctype = ZPROP_SRC_NONE; } else if (source[0] == '\0') { *srctype = ZPROP_SRC_DEFAULT; } else if (strstr(source, ZPROP_SOURCE_VAL_RECVD) != NULL) { *srctype = ZPROP_SRC_RECEIVED; } else { if (strcmp(source, zhp->zfs_name) == 0) { *srctype = ZPROP_SRC_LOCAL; } else { (void) strlcpy(statbuf, source, statlen); *srctype = ZPROP_SRC_INHERITED; } } } int zfs_prop_get_recvd(zfs_handle_t *zhp, const char *propname, char *propbuf, size_t proplen, boolean_t literal) { zfs_prop_t prop; int err = 0; if (zhp->zfs_recvd_props == NULL) if (get_recvd_props_ioctl(zhp) != 0) return (-1); prop = zfs_name_to_prop(propname); if (prop != ZPROP_USERPROP) { uint64_t cookie; if (!nvlist_exists(zhp->zfs_recvd_props, propname)) return (-1); zfs_set_recvd_props_mode(zhp, &cookie); err = zfs_prop_get(zhp, prop, propbuf, proplen, NULL, NULL, 0, literal); zfs_unset_recvd_props_mode(zhp, &cookie); } else { nvlist_t *propval; char *recvdval; if (nvlist_lookup_nvlist(zhp->zfs_recvd_props, propname, &propval) != 0) return (-1); recvdval = fnvlist_lookup_string(propval, ZPROP_VALUE); (void) strlcpy(propbuf, recvdval, proplen); } return (err == 0 ? 0 : -1); } static int get_clones_string(zfs_handle_t *zhp, char *propbuf, size_t proplen) { nvlist_t *value; nvpair_t *pair; value = zfs_get_clones_nvl(zhp); if (value == NULL || nvlist_empty(value)) return (-1); propbuf[0] = '\0'; for (pair = nvlist_next_nvpair(value, NULL); pair != NULL; pair = nvlist_next_nvpair(value, pair)) { if (propbuf[0] != '\0') (void) strlcat(propbuf, ",", proplen); (void) strlcat(propbuf, nvpair_name(pair), proplen); } return (0); } struct get_clones_arg { uint64_t numclones; nvlist_t *value; const char *origin; char buf[ZFS_MAX_DATASET_NAME_LEN]; }; static int get_clones_cb(zfs_handle_t *zhp, void *arg) { struct get_clones_arg *gca = arg; if (gca->numclones == 0) { zfs_close(zhp); return (0); } if (zfs_prop_get(zhp, ZFS_PROP_ORIGIN, gca->buf, sizeof (gca->buf), NULL, NULL, 0, B_TRUE) != 0) goto out; if (strcmp(gca->buf, gca->origin) == 0) { fnvlist_add_boolean(gca->value, zfs_get_name(zhp)); gca->numclones--; } out: (void) zfs_iter_children(zhp, get_clones_cb, gca); zfs_close(zhp); return (0); } nvlist_t * zfs_get_clones_nvl(zfs_handle_t *zhp) { nvlist_t *nv, *value; if (nvlist_lookup_nvlist(zhp->zfs_props, zfs_prop_to_name(ZFS_PROP_CLONES), &nv) != 0) { struct get_clones_arg gca; /* * if this is a snapshot, then the kernel wasn't able * to get the clones. Do it by slowly iterating. */ if (zhp->zfs_type != ZFS_TYPE_SNAPSHOT) return (NULL); if (nvlist_alloc(&nv, NV_UNIQUE_NAME, 0) != 0) return (NULL); if (nvlist_alloc(&value, NV_UNIQUE_NAME, 0) != 0) { nvlist_free(nv); return (NULL); } gca.numclones = zfs_prop_get_int(zhp, ZFS_PROP_NUMCLONES); gca.value = value; gca.origin = zhp->zfs_name; if (gca.numclones != 0) { zfs_handle_t *root; char pool[ZFS_MAX_DATASET_NAME_LEN]; char *cp = pool; /* get the pool name */ (void) strlcpy(pool, zhp->zfs_name, sizeof (pool)); (void) strsep(&cp, "/@"); root = zfs_open(zhp->zfs_hdl, pool, ZFS_TYPE_FILESYSTEM); if (root == NULL) { nvlist_free(nv); nvlist_free(value); return (NULL); } (void) get_clones_cb(root, &gca); } if (gca.numclones != 0 || nvlist_add_nvlist(nv, ZPROP_VALUE, value) != 0 || nvlist_add_nvlist(zhp->zfs_props, zfs_prop_to_name(ZFS_PROP_CLONES), nv) != 0) { nvlist_free(nv); nvlist_free(value); return (NULL); } nvlist_free(nv); nvlist_free(value); nv = fnvlist_lookup_nvlist(zhp->zfs_props, zfs_prop_to_name(ZFS_PROP_CLONES)); } return (fnvlist_lookup_nvlist(nv, ZPROP_VALUE)); } static int get_rsnaps_string(zfs_handle_t *zhp, char *propbuf, size_t proplen) { nvlist_t *value; uint64_t *snaps; uint_t nsnaps; if (nvlist_lookup_nvlist(zhp->zfs_props, zfs_prop_to_name(ZFS_PROP_REDACT_SNAPS), &value) != 0) return (-1); if (nvlist_lookup_uint64_array(value, ZPROP_VALUE, &snaps, &nsnaps) != 0) return (-1); if (nsnaps == 0) { /* There's no redaction snapshots; pass a special value back */ (void) snprintf(propbuf, proplen, "none"); return (0); } propbuf[0] = '\0'; for (int i = 0; i < nsnaps; i++) { char buf[128]; if (propbuf[0] != '\0') (void) strlcat(propbuf, ",", proplen); (void) snprintf(buf, sizeof (buf), "%llu", (u_longlong_t)snaps[i]); (void) strlcat(propbuf, buf, proplen); } return (0); } /* * Accepts a property and value and checks that the value * matches the one found by the channel program. If they are * not equal, print both of them. */ static void zcp_check(zfs_handle_t *zhp, zfs_prop_t prop, uint64_t intval, const char *strval) { if (!zhp->zfs_hdl->libzfs_prop_debug) return; int error; char *poolname = zhp->zpool_hdl->zpool_name; const char *prop_name = zfs_prop_to_name(prop); const char *program = "args = ...\n" "ds = args['dataset']\n" "prop = args['property']\n" "value, setpoint = zfs.get_prop(ds, prop)\n" "return {value=value, setpoint=setpoint}\n"; nvlist_t *outnvl; nvlist_t *retnvl; nvlist_t *argnvl = fnvlist_alloc(); fnvlist_add_string(argnvl, "dataset", zhp->zfs_name); fnvlist_add_string(argnvl, "property", zfs_prop_to_name(prop)); error = lzc_channel_program_nosync(poolname, program, 10 * 1000 * 1000, 10 * 1024 * 1024, argnvl, &outnvl); if (error == 0) { retnvl = fnvlist_lookup_nvlist(outnvl, "return"); if (zfs_prop_get_type(prop) == PROP_TYPE_NUMBER) { int64_t ans; error = nvlist_lookup_int64(retnvl, "value", &ans); if (error != 0) { (void) fprintf(stderr, "%s: zcp check error: " "%u\n", prop_name, error); return; } if (ans != intval) { (void) fprintf(stderr, "%s: zfs found %llu, " "but zcp found %llu\n", prop_name, (u_longlong_t)intval, (u_longlong_t)ans); } } else { char *str_ans; error = nvlist_lookup_string(retnvl, "value", &str_ans); if (error != 0) { (void) fprintf(stderr, "%s: zcp check error: " "%u\n", prop_name, error); return; } if (strcmp(strval, str_ans) != 0) { (void) fprintf(stderr, "%s: zfs found '%s', but zcp found '%s'\n", prop_name, strval, str_ans); } } } else { (void) fprintf(stderr, "%s: zcp check failed, channel program " "error: %u\n", prop_name, error); } nvlist_free(argnvl); nvlist_free(outnvl); } /* * Retrieve a property from the given object. If 'literal' is specified, then * numbers are left as exact values. Otherwise, numbers are converted to a * human-readable form. * * Returns 0 on success, or -1 on error. */ int zfs_prop_get(zfs_handle_t *zhp, zfs_prop_t prop, char *propbuf, size_t proplen, zprop_source_t *src, char *statbuf, size_t statlen, boolean_t literal) { char *source = NULL; uint64_t val; const char *str; const char *strval; boolean_t received = zfs_is_recvd_props_mode(zhp); /* * Check to see if this property applies to our object */ if (!zfs_prop_valid_for_type(prop, zhp->zfs_type, B_FALSE)) return (-1); if (received && zfs_prop_readonly(prop)) return (-1); if (src) *src = ZPROP_SRC_NONE; switch (prop) { case ZFS_PROP_CREATION: /* * 'creation' is a time_t stored in the statistics. We convert * this into a string unless 'literal' is specified. */ { val = getprop_uint64(zhp, prop, &source); time_t time = (time_t)val; struct tm t; if (literal || localtime_r(&time, &t) == NULL || strftime(propbuf, proplen, "%a %b %e %k:%M %Y", &t) == 0) (void) snprintf(propbuf, proplen, "%llu", (u_longlong_t)val); } zcp_check(zhp, prop, val, NULL); break; case ZFS_PROP_MOUNTPOINT: /* * Getting the precise mountpoint can be tricky. * * - for 'none' or 'legacy', return those values. * - for inherited mountpoints, we want to take everything * after our ancestor and append it to the inherited value. * * If the pool has an alternate root, we want to prepend that * root to any values we return. */ str = getprop_string(zhp, prop, &source); if (str[0] == '/') { char buf[MAXPATHLEN]; char *root = buf; const char *relpath; /* * If we inherit the mountpoint, even from a dataset * with a received value, the source will be the path of * the dataset we inherit from. If source is * ZPROP_SOURCE_VAL_RECVD, the received value is not * inherited. */ if (strcmp(source, ZPROP_SOURCE_VAL_RECVD) == 0) { relpath = ""; } else { relpath = zhp->zfs_name + strlen(source); if (relpath[0] == '/') relpath++; } if ((zpool_get_prop(zhp->zpool_hdl, ZPOOL_PROP_ALTROOT, buf, MAXPATHLEN, NULL, B_FALSE)) || (strcmp(root, "-") == 0)) root[0] = '\0'; /* * Special case an alternate root of '/'. This will * avoid having multiple leading slashes in the * mountpoint path. */ if (strcmp(root, "/") == 0) root++; /* * If the mountpoint is '/' then skip over this * if we are obtaining either an alternate root or * an inherited mountpoint. */ if (str[1] == '\0' && (root[0] != '\0' || relpath[0] != '\0')) str++; if (relpath[0] == '\0') (void) snprintf(propbuf, proplen, "%s%s", root, str); else (void) snprintf(propbuf, proplen, "%s%s%s%s", root, str, relpath[0] == '@' ? "" : "/", relpath); } else { /* 'legacy' or 'none' */ (void) strlcpy(propbuf, str, proplen); } zcp_check(zhp, prop, 0, propbuf); break; case ZFS_PROP_ORIGIN: str = getprop_string(zhp, prop, &source); if (str == NULL) return (-1); (void) strlcpy(propbuf, str, proplen); zcp_check(zhp, prop, 0, str); break; case ZFS_PROP_REDACT_SNAPS: if (get_rsnaps_string(zhp, propbuf, proplen) != 0) return (-1); break; case ZFS_PROP_CLONES: if (get_clones_string(zhp, propbuf, proplen) != 0) return (-1); break; case ZFS_PROP_QUOTA: case ZFS_PROP_REFQUOTA: case ZFS_PROP_RESERVATION: case ZFS_PROP_REFRESERVATION: if (get_numeric_property(zhp, prop, src, &source, &val) != 0) return (-1); /* * If quota or reservation is 0, we translate this into 'none' * (unless literal is set), and indicate that it's the default * value. Otherwise, we print the number nicely and indicate * that its set locally. */ if (val == 0) { if (literal) (void) strlcpy(propbuf, "0", proplen); else (void) strlcpy(propbuf, "none", proplen); } else { if (literal) (void) snprintf(propbuf, proplen, "%llu", (u_longlong_t)val); else zfs_nicebytes(val, propbuf, proplen); } zcp_check(zhp, prop, val, NULL); break; case ZFS_PROP_FILESYSTEM_LIMIT: case ZFS_PROP_SNAPSHOT_LIMIT: case ZFS_PROP_FILESYSTEM_COUNT: case ZFS_PROP_SNAPSHOT_COUNT: if (get_numeric_property(zhp, prop, src, &source, &val) != 0) return (-1); /* * If limit is UINT64_MAX, we translate this into 'none', and * indicate that it's the default value. Otherwise, we print * the number nicely and indicate that it's set locally. */ if (val == UINT64_MAX) { (void) strlcpy(propbuf, "none", proplen); } else if (literal) { (void) snprintf(propbuf, proplen, "%llu", (u_longlong_t)val); } else { zfs_nicenum(val, propbuf, proplen); } zcp_check(zhp, prop, val, NULL); break; case ZFS_PROP_REFRATIO: case ZFS_PROP_COMPRESSRATIO: if (get_numeric_property(zhp, prop, src, &source, &val) != 0) return (-1); if (literal) (void) snprintf(propbuf, proplen, "%llu.%02llu", (u_longlong_t)(val / 100), (u_longlong_t)(val % 100)); else (void) snprintf(propbuf, proplen, "%llu.%02llux", (u_longlong_t)(val / 100), (u_longlong_t)(val % 100)); zcp_check(zhp, prop, val, NULL); break; case ZFS_PROP_TYPE: switch (zhp->zfs_type) { case ZFS_TYPE_FILESYSTEM: str = "filesystem"; break; case ZFS_TYPE_VOLUME: str = "volume"; break; case ZFS_TYPE_SNAPSHOT: str = "snapshot"; break; case ZFS_TYPE_BOOKMARK: str = "bookmark"; break; default: abort(); } (void) snprintf(propbuf, proplen, "%s", str); zcp_check(zhp, prop, 0, propbuf); break; case ZFS_PROP_MOUNTED: /* * The 'mounted' property is a pseudo-property that described * whether the filesystem is currently mounted. Even though * it's a boolean value, the typical values of "on" and "off" * don't make sense, so we translate to "yes" and "no". */ if (get_numeric_property(zhp, ZFS_PROP_MOUNTED, src, &source, &val) != 0) return (-1); if (val) (void) strlcpy(propbuf, "yes", proplen); else (void) strlcpy(propbuf, "no", proplen); break; case ZFS_PROP_NAME: /* * The 'name' property is a pseudo-property derived from the * dataset name. It is presented as a real property to simplify * consumers. */ (void) strlcpy(propbuf, zhp->zfs_name, proplen); zcp_check(zhp, prop, 0, propbuf); break; case ZFS_PROP_MLSLABEL: { #ifdef HAVE_MLSLABEL m_label_t *new_sl = NULL; char *ascii = NULL; /* human readable label */ (void) strlcpy(propbuf, getprop_string(zhp, prop, &source), proplen); if (literal || (strcasecmp(propbuf, ZFS_MLSLABEL_DEFAULT) == 0)) break; /* * Try to translate the internal hex string to * human-readable output. If there are any * problems just use the hex string. */ if (str_to_label(propbuf, &new_sl, MAC_LABEL, L_NO_CORRECTION, NULL) == -1) { m_label_free(new_sl); break; } if (label_to_str(new_sl, &ascii, M_LABEL, DEF_NAMES) != 0) { if (ascii) free(ascii); m_label_free(new_sl); break; } m_label_free(new_sl); (void) strlcpy(propbuf, ascii, proplen); free(ascii); #else (void) strlcpy(propbuf, getprop_string(zhp, prop, &source), proplen); #endif /* HAVE_MLSLABEL */ } break; case ZFS_PROP_GUID: case ZFS_PROP_KEY_GUID: case ZFS_PROP_IVSET_GUID: case ZFS_PROP_CREATETXG: case ZFS_PROP_OBJSETID: case ZFS_PROP_PBKDF2_ITERS: /* * These properties are stored as numbers, but they are * identifiers or counters. * We don't want them to be pretty printed, because pretty * printing truncates their values making them useless. */ if (get_numeric_property(zhp, prop, src, &source, &val) != 0) return (-1); (void) snprintf(propbuf, proplen, "%llu", (u_longlong_t)val); zcp_check(zhp, prop, val, NULL); break; case ZFS_PROP_REFERENCED: case ZFS_PROP_AVAILABLE: case ZFS_PROP_USED: case ZFS_PROP_USEDSNAP: case ZFS_PROP_USEDDS: case ZFS_PROP_USEDREFRESERV: case ZFS_PROP_USEDCHILD: if (get_numeric_property(zhp, prop, src, &source, &val) != 0) return (-1); if (literal) { (void) snprintf(propbuf, proplen, "%llu", (u_longlong_t)val); } else { zfs_nicebytes(val, propbuf, proplen); } zcp_check(zhp, prop, val, NULL); break; case ZFS_PROP_SNAPSHOTS_CHANGED: { if ((get_numeric_property(zhp, prop, src, &source, &val) != 0) || val == 0) { return (-1); } time_t time = (time_t)val; struct tm t; if (literal || localtime_r(&time, &t) == NULL || strftime(propbuf, proplen, "%a %b %e %k:%M:%S %Y", &t) == 0) (void) snprintf(propbuf, proplen, "%llu", (u_longlong_t)val); } zcp_check(zhp, prop, val, NULL); break; default: switch (zfs_prop_get_type(prop)) { case PROP_TYPE_NUMBER: if (get_numeric_property(zhp, prop, src, &source, &val) != 0) { return (-1); } if (literal) { (void) snprintf(propbuf, proplen, "%llu", (u_longlong_t)val); } else { zfs_nicenum(val, propbuf, proplen); } zcp_check(zhp, prop, val, NULL); break; case PROP_TYPE_STRING: str = getprop_string(zhp, prop, &source); if (str == NULL) return (-1); (void) strlcpy(propbuf, str, proplen); zcp_check(zhp, prop, 0, str); break; case PROP_TYPE_INDEX: if (get_numeric_property(zhp, prop, src, &source, &val) != 0) return (-1); if (zfs_prop_index_to_string(prop, val, &strval) != 0) return (-1); (void) strlcpy(propbuf, strval, proplen); zcp_check(zhp, prop, 0, strval); break; default: abort(); } } get_source(zhp, src, source, statbuf, statlen); return (0); } /* * Utility function to get the given numeric property. Does no validation that * the given property is the appropriate type; should only be used with * hard-coded property types. */ uint64_t zfs_prop_get_int(zfs_handle_t *zhp, zfs_prop_t prop) { char *source; uint64_t val = 0; (void) get_numeric_property(zhp, prop, NULL, &source, &val); return (val); } static int zfs_prop_set_int(zfs_handle_t *zhp, zfs_prop_t prop, uint64_t val) { char buf[64]; (void) snprintf(buf, sizeof (buf), "%llu", (longlong_t)val); return (zfs_prop_set(zhp, zfs_prop_to_name(prop), buf)); } /* * Similar to zfs_prop_get(), but returns the value as an integer. */ int zfs_prop_get_numeric(zfs_handle_t *zhp, zfs_prop_t prop, uint64_t *value, zprop_source_t *src, char *statbuf, size_t statlen) { char *source; /* * Check to see if this property applies to our object */ if (!zfs_prop_valid_for_type(prop, zhp->zfs_type, B_FALSE)) { return (zfs_error_fmt(zhp->zfs_hdl, EZFS_PROPTYPE, dgettext(TEXT_DOMAIN, "cannot get property '%s'"), zfs_prop_to_name(prop))); } if (src) *src = ZPROP_SRC_NONE; if (get_numeric_property(zhp, prop, src, &source, value) != 0) return (-1); get_source(zhp, src, source, statbuf, statlen); return (0); } #ifdef HAVE_IDMAP static int idmap_id_to_numeric_domain_rid(uid_t id, boolean_t isuser, char **domainp, idmap_rid_t *ridp) { idmap_get_handle_t *get_hdl = NULL; idmap_stat status; int err = EINVAL; if (idmap_get_create(&get_hdl) != IDMAP_SUCCESS) goto out; if (isuser) { err = idmap_get_sidbyuid(get_hdl, id, IDMAP_REQ_FLG_USE_CACHE, domainp, ridp, &status); } else { err = idmap_get_sidbygid(get_hdl, id, IDMAP_REQ_FLG_USE_CACHE, domainp, ridp, &status); } if (err == IDMAP_SUCCESS && idmap_get_mappings(get_hdl) == IDMAP_SUCCESS && status == IDMAP_SUCCESS) err = 0; else err = EINVAL; out: if (get_hdl) idmap_get_destroy(get_hdl); return (err); } #endif /* HAVE_IDMAP */ /* * convert the propname into parameters needed by kernel * Eg: userquota@ahrens -> ZFS_PROP_USERQUOTA, "", 126829 * Eg: userused@matt@domain -> ZFS_PROP_USERUSED, "S-1-123-456", 789 * Eg: groupquota@staff -> ZFS_PROP_GROUPQUOTA, "", 1234 * Eg: groupused@staff -> ZFS_PROP_GROUPUSED, "", 1234 * Eg: projectquota@123 -> ZFS_PROP_PROJECTQUOTA, "", 123 * Eg: projectused@789 -> ZFS_PROP_PROJECTUSED, "", 789 */ static int userquota_propname_decode(const char *propname, boolean_t zoned, zfs_userquota_prop_t *typep, char *domain, int domainlen, uint64_t *ridp) { zfs_userquota_prop_t type; char *cp; boolean_t isuser; boolean_t isgroup; boolean_t isproject; struct passwd *pw; struct group *gr; domain[0] = '\0'; /* Figure out the property type ({user|group|project}{quota|space}) */ for (type = 0; type < ZFS_NUM_USERQUOTA_PROPS; type++) { if (strncmp(propname, zfs_userquota_prop_prefixes[type], strlen(zfs_userquota_prop_prefixes[type])) == 0) break; } if (type == ZFS_NUM_USERQUOTA_PROPS) return (EINVAL); *typep = type; isuser = (type == ZFS_PROP_USERQUOTA || type == ZFS_PROP_USERUSED || type == ZFS_PROP_USEROBJQUOTA || type == ZFS_PROP_USEROBJUSED); isgroup = (type == ZFS_PROP_GROUPQUOTA || type == ZFS_PROP_GROUPUSED || type == ZFS_PROP_GROUPOBJQUOTA || type == ZFS_PROP_GROUPOBJUSED); isproject = (type == ZFS_PROP_PROJECTQUOTA || type == ZFS_PROP_PROJECTUSED || type == ZFS_PROP_PROJECTOBJQUOTA || type == ZFS_PROP_PROJECTOBJUSED); cp = strchr(propname, '@') + 1; if (isuser && (pw = getpwnam(cp)) != NULL) { if (zoned && getzoneid() == GLOBAL_ZONEID) return (ENOENT); *ridp = pw->pw_uid; } else if (isgroup && (gr = getgrnam(cp)) != NULL) { if (zoned && getzoneid() == GLOBAL_ZONEID) return (ENOENT); *ridp = gr->gr_gid; } else if (!isproject && strchr(cp, '@')) { #ifdef HAVE_IDMAP /* * It's a SID name (eg "user@domain") that needs to be * turned into S-1-domainID-RID. */ directory_error_t e; char *numericsid = NULL; char *end; if (zoned && getzoneid() == GLOBAL_ZONEID) return (ENOENT); if (isuser) { e = directory_sid_from_user_name(NULL, cp, &numericsid); } else { e = directory_sid_from_group_name(NULL, cp, &numericsid); } if (e != NULL) { directory_error_free(e); return (ENOENT); } if (numericsid == NULL) return (ENOENT); cp = numericsid; (void) strlcpy(domain, cp, domainlen); cp = strrchr(domain, '-'); *cp = '\0'; cp++; errno = 0; *ridp = strtoull(cp, &end, 10); free(numericsid); if (errno != 0 || *end != '\0') return (EINVAL); #else (void) domainlen; return (ENOSYS); #endif /* HAVE_IDMAP */ } else { /* It's a user/group/project ID (eg "12345"). */ uid_t id; char *end; id = strtoul(cp, &end, 10); if (*end != '\0') return (EINVAL); if (id > MAXUID && !isproject) { #ifdef HAVE_IDMAP /* It's an ephemeral ID. */ idmap_rid_t rid; char *mapdomain; if (idmap_id_to_numeric_domain_rid(id, isuser, &mapdomain, &rid) != 0) return (ENOENT); (void) strlcpy(domain, mapdomain, domainlen); *ridp = rid; #else return (ENOSYS); #endif /* HAVE_IDMAP */ } else { *ridp = id; } } return (0); } static int zfs_prop_get_userquota_common(zfs_handle_t *zhp, const char *propname, uint64_t *propvalue, zfs_userquota_prop_t *typep) { int err; zfs_cmd_t zc = {"\0"}; (void) strlcpy(zc.zc_name, zhp->zfs_name, sizeof (zc.zc_name)); err = userquota_propname_decode(propname, zfs_prop_get_int(zhp, ZFS_PROP_ZONED), typep, zc.zc_value, sizeof (zc.zc_value), &zc.zc_guid); zc.zc_objset_type = *typep; if (err) return (err); err = zfs_ioctl(zhp->zfs_hdl, ZFS_IOC_USERSPACE_ONE, &zc); if (err) return (err); *propvalue = zc.zc_cookie; return (0); } int zfs_prop_get_userquota_int(zfs_handle_t *zhp, const char *propname, uint64_t *propvalue) { zfs_userquota_prop_t type; return (zfs_prop_get_userquota_common(zhp, propname, propvalue, &type)); } int zfs_prop_get_userquota(zfs_handle_t *zhp, const char *propname, char *propbuf, int proplen, boolean_t literal) { int err; uint64_t propvalue; zfs_userquota_prop_t type; err = zfs_prop_get_userquota_common(zhp, propname, &propvalue, &type); if (err) return (err); if (literal) { (void) snprintf(propbuf, proplen, "%llu", (u_longlong_t)propvalue); } else if (propvalue == 0 && (type == ZFS_PROP_USERQUOTA || type == ZFS_PROP_GROUPQUOTA || type == ZFS_PROP_USEROBJQUOTA || type == ZFS_PROP_GROUPOBJQUOTA || type == ZFS_PROP_PROJECTQUOTA || type == ZFS_PROP_PROJECTOBJQUOTA)) { (void) strlcpy(propbuf, "none", proplen); } else if (type == ZFS_PROP_USERQUOTA || type == ZFS_PROP_GROUPQUOTA || type == ZFS_PROP_USERUSED || type == ZFS_PROP_GROUPUSED || type == ZFS_PROP_PROJECTUSED || type == ZFS_PROP_PROJECTQUOTA) { zfs_nicebytes(propvalue, propbuf, proplen); } else { zfs_nicenum(propvalue, propbuf, proplen); } return (0); } /* * propname must start with "written@" or "written#". */ int zfs_prop_get_written_int(zfs_handle_t *zhp, const char *propname, uint64_t *propvalue) { int err; zfs_cmd_t zc = {"\0"}; const char *snapname; (void) strlcpy(zc.zc_name, zhp->zfs_name, sizeof (zc.zc_name)); assert(zfs_prop_written(propname)); snapname = propname + strlen("written@"); if (strchr(snapname, '@') != NULL || strchr(snapname, '#') != NULL) { /* full snapshot or bookmark name specified */ (void) strlcpy(zc.zc_value, snapname, sizeof (zc.zc_value)); } else { /* snapname is the short name, append it to zhp's fsname */ char *cp; (void) strlcpy(zc.zc_value, zhp->zfs_name, sizeof (zc.zc_value)); cp = strchr(zc.zc_value, '@'); if (cp != NULL) *cp = '\0'; (void) strlcat(zc.zc_value, snapname - 1, sizeof (zc.zc_value)); } err = zfs_ioctl(zhp->zfs_hdl, ZFS_IOC_SPACE_WRITTEN, &zc); if (err) return (err); *propvalue = zc.zc_cookie; return (0); } int zfs_prop_get_written(zfs_handle_t *zhp, const char *propname, char *propbuf, int proplen, boolean_t literal) { int err; uint64_t propvalue; err = zfs_prop_get_written_int(zhp, propname, &propvalue); if (err) return (err); if (literal) { (void) snprintf(propbuf, proplen, "%llu", (u_longlong_t)propvalue); } else { zfs_nicebytes(propvalue, propbuf, proplen); } return (0); } /* * Returns the name of the given zfs handle. */ const char * zfs_get_name(const zfs_handle_t *zhp) { return (zhp->zfs_name); } /* * Returns the name of the parent pool for the given zfs handle. */ const char * zfs_get_pool_name(const zfs_handle_t *zhp) { return (zhp->zpool_hdl->zpool_name); } /* * Returns the type of the given zfs handle. */ zfs_type_t zfs_get_type(const zfs_handle_t *zhp) { return (zhp->zfs_type); } /* * Returns the type of the given zfs handle, * or, if a snapshot, the type of the snapshotted dataset. */ zfs_type_t zfs_get_underlying_type(const zfs_handle_t *zhp) { return (zhp->zfs_head_type); } /* * Is one dataset name a child dataset of another? * * Needs to handle these cases: * Dataset 1 "a/foo" "a/foo" "a/foo" "a/foo" * Dataset 2 "a/fo" "a/foobar" "a/bar/baz" "a/foo/bar" * Descendant? No. No. No. Yes. */ static boolean_t is_descendant(const char *ds1, const char *ds2) { size_t d1len = strlen(ds1); /* ds2 can't be a descendant if it's smaller */ if (strlen(ds2) < d1len) return (B_FALSE); /* otherwise, compare strings and verify that there's a '/' char */ return (ds2[d1len] == '/' && (strncmp(ds1, ds2, d1len) == 0)); } /* * Given a complete name, return just the portion that refers to the parent. * Will return -1 if there is no parent (path is just the name of the * pool). */ static int parent_name(const char *path, char *buf, size_t buflen) { char *slashp; (void) strlcpy(buf, path, buflen); if ((slashp = strrchr(buf, '/')) == NULL) return (-1); *slashp = '\0'; return (0); } int zfs_parent_name(zfs_handle_t *zhp, char *buf, size_t buflen) { return (parent_name(zfs_get_name(zhp), buf, buflen)); } /* * If accept_ancestor is false, then check to make sure that the given path has * a parent, and that it exists. If accept_ancestor is true, then find the * closest existing ancestor for the given path. In prefixlen return the * length of already existing prefix of the given path. We also fetch the * 'zoned' property, which is used to validate property settings when creating * new datasets. */ static int check_parents(libzfs_handle_t *hdl, const char *path, uint64_t *zoned, boolean_t accept_ancestor, int *prefixlen) { zfs_cmd_t zc = {"\0"}; char parent[ZFS_MAX_DATASET_NAME_LEN]; char *slash; zfs_handle_t *zhp; char errbuf[ERRBUFLEN]; uint64_t is_zoned; (void) snprintf(errbuf, sizeof (errbuf), dgettext(TEXT_DOMAIN, "cannot create '%s'"), path); /* get parent, and check to see if this is just a pool */ if (parent_name(path, parent, sizeof (parent)) != 0) { zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "missing dataset name")); return (zfs_error(hdl, EZFS_INVALIDNAME, errbuf)); } /* check to see if the pool exists */ if ((slash = strchr(parent, '/')) == NULL) slash = parent + strlen(parent); (void) strlcpy(zc.zc_name, parent, MIN(sizeof (zc.zc_name), slash - parent + 1)); if (zfs_ioctl(hdl, ZFS_IOC_OBJSET_STATS, &zc) != 0 && errno == ENOENT) { zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "no such pool '%s'"), zc.zc_name); return (zfs_error(hdl, EZFS_NOENT, errbuf)); } /* check to see if the parent dataset exists */ while ((zhp = make_dataset_handle(hdl, parent)) == NULL) { if (errno == ENOENT && accept_ancestor) { /* * Go deeper to find an ancestor, give up on top level. */ if (parent_name(parent, parent, sizeof (parent)) != 0) { zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "no such pool '%s'"), zc.zc_name); return (zfs_error(hdl, EZFS_NOENT, errbuf)); } } else if (errno == ENOENT) { zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "parent does not exist")); return (zfs_error(hdl, EZFS_NOENT, errbuf)); } else return (zfs_standard_error(hdl, errno, errbuf)); } is_zoned = zfs_prop_get_int(zhp, ZFS_PROP_ZONED); if (zoned != NULL) *zoned = is_zoned; /* we are in a non-global zone, but parent is in the global zone */ if (getzoneid() != GLOBAL_ZONEID && !is_zoned) { (void) zfs_standard_error(hdl, EPERM, errbuf); zfs_close(zhp); return (-1); } /* make sure parent is a filesystem */ if (zfs_get_type(zhp) != ZFS_TYPE_FILESYSTEM) { zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "parent is not a filesystem")); (void) zfs_error(hdl, EZFS_BADTYPE, errbuf); zfs_close(zhp); return (-1); } zfs_close(zhp); if (prefixlen != NULL) *prefixlen = strlen(parent); return (0); } /* * Finds whether the dataset of the given type(s) exists. */ boolean_t zfs_dataset_exists(libzfs_handle_t *hdl, const char *path, zfs_type_t types) { zfs_handle_t *zhp; if (!zfs_validate_name(hdl, path, types, B_FALSE)) return (B_FALSE); /* * Try to get stats for the dataset, which will tell us if it exists. */ if ((zhp = make_dataset_handle(hdl, path)) != NULL) { int ds_type = zhp->zfs_type; zfs_close(zhp); if (types & ds_type) return (B_TRUE); } return (B_FALSE); } /* * Given a path to 'target', create all the ancestors between * the prefixlen portion of the path, and the target itself. * Fail if the initial prefixlen-ancestor does not already exist. */ int create_parents(libzfs_handle_t *hdl, char *target, int prefixlen) { zfs_handle_t *h; char *cp; const char *opname; /* make sure prefix exists */ cp = target + prefixlen; if (*cp != '/') { assert(strchr(cp, '/') == NULL); h = zfs_open(hdl, target, ZFS_TYPE_FILESYSTEM); } else { *cp = '\0'; h = zfs_open(hdl, target, ZFS_TYPE_FILESYSTEM); *cp = '/'; } if (h == NULL) return (-1); zfs_close(h); /* * Attempt to create, mount, and share any ancestor filesystems, * up to the prefixlen-long one. */ for (cp = target + prefixlen + 1; (cp = strchr(cp, '/')) != NULL; *cp = '/', cp++) { *cp = '\0'; h = make_dataset_handle(hdl, target); if (h) { /* it already exists, nothing to do here */ zfs_close(h); continue; } if (zfs_create(hdl, target, ZFS_TYPE_FILESYSTEM, NULL) != 0) { opname = dgettext(TEXT_DOMAIN, "create"); goto ancestorerr; } h = zfs_open(hdl, target, ZFS_TYPE_FILESYSTEM); if (h == NULL) { opname = dgettext(TEXT_DOMAIN, "open"); goto ancestorerr; } if (zfs_mount(h, NULL, 0) != 0) { opname = dgettext(TEXT_DOMAIN, "mount"); goto ancestorerr; } if (zfs_share(h, NULL) != 0) { opname = dgettext(TEXT_DOMAIN, "share"); goto ancestorerr; } zfs_close(h); } zfs_commit_shares(NULL); return (0); ancestorerr: zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "failed to %s ancestor '%s'"), opname, target); return (-1); } /* * Creates non-existing ancestors of the given path. */ int zfs_create_ancestors(libzfs_handle_t *hdl, const char *path) { int prefix; char *path_copy; char errbuf[ERRBUFLEN]; int rc = 0; (void) snprintf(errbuf, sizeof (errbuf), dgettext(TEXT_DOMAIN, "cannot create '%s'"), path); /* * Check that we are not passing the nesting limit * before we start creating any ancestors. */ if (dataset_nestcheck(path) != 0) { zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "maximum name nesting depth exceeded")); return (zfs_error(hdl, EZFS_INVALIDNAME, errbuf)); } if (check_parents(hdl, path, NULL, B_TRUE, &prefix) != 0) return (-1); if ((path_copy = strdup(path)) != NULL) { rc = create_parents(hdl, path_copy, prefix); free(path_copy); } if (path_copy == NULL || rc != 0) return (-1); return (0); } /* * Create a new filesystem or volume. */ int zfs_create(libzfs_handle_t *hdl, const char *path, zfs_type_t type, nvlist_t *props) { int ret; uint64_t size = 0; uint64_t blocksize = zfs_prop_default_numeric(ZFS_PROP_VOLBLOCKSIZE); uint64_t zoned; enum lzc_dataset_type ost; zpool_handle_t *zpool_handle; uint8_t *wkeydata = NULL; uint_t wkeylen = 0; char errbuf[ERRBUFLEN]; char parent[ZFS_MAX_DATASET_NAME_LEN]; (void) snprintf(errbuf, sizeof (errbuf), dgettext(TEXT_DOMAIN, "cannot create '%s'"), path); /* validate the path, taking care to note the extended error message */ if (!zfs_validate_name(hdl, path, type, B_TRUE)) return (zfs_error(hdl, EZFS_INVALIDNAME, errbuf)); if (dataset_nestcheck(path) != 0) { zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "maximum name nesting depth exceeded")); return (zfs_error(hdl, EZFS_INVALIDNAME, errbuf)); } /* validate parents exist */ if (check_parents(hdl, path, &zoned, B_FALSE, NULL) != 0) return (-1); /* * The failure modes when creating a dataset of a different type over * one that already exists is a little strange. In particular, if you * try to create a dataset on top of an existing dataset, the ioctl() * will return ENOENT, not EEXIST. To prevent this from happening, we * first try to see if the dataset exists. */ if (zfs_dataset_exists(hdl, path, ZFS_TYPE_DATASET)) { zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "dataset already exists")); return (zfs_error(hdl, EZFS_EXISTS, errbuf)); } if (type == ZFS_TYPE_VOLUME) ost = LZC_DATSET_TYPE_ZVOL; else ost = LZC_DATSET_TYPE_ZFS; /* open zpool handle for prop validation */ char pool_path[ZFS_MAX_DATASET_NAME_LEN]; (void) strlcpy(pool_path, path, sizeof (pool_path)); /* truncate pool_path at first slash */ char *p = strchr(pool_path, '/'); if (p != NULL) *p = '\0'; if ((zpool_handle = zpool_open(hdl, pool_path)) == NULL) return (-1); if (props && (props = zfs_valid_proplist(hdl, type, props, zoned, NULL, zpool_handle, B_TRUE, errbuf)) == 0) { zpool_close(zpool_handle); return (-1); } zpool_close(zpool_handle); if (type == ZFS_TYPE_VOLUME) { /* * If we are creating a volume, the size and block size must * satisfy a few restraints. First, the blocksize must be a * valid block size between SPA_{MIN,MAX}BLOCKSIZE. Second, the * volsize must be a multiple of the block size, and cannot be * zero. */ if (props == NULL || nvlist_lookup_uint64(props, zfs_prop_to_name(ZFS_PROP_VOLSIZE), &size) != 0) { nvlist_free(props); zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "missing volume size")); return (zfs_error(hdl, EZFS_BADPROP, errbuf)); } if ((ret = nvlist_lookup_uint64(props, zfs_prop_to_name(ZFS_PROP_VOLBLOCKSIZE), &blocksize)) != 0) { if (ret == ENOENT) { blocksize = zfs_prop_default_numeric( ZFS_PROP_VOLBLOCKSIZE); } else { nvlist_free(props); zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "missing volume block size")); return (zfs_error(hdl, EZFS_BADPROP, errbuf)); } } if (size == 0) { nvlist_free(props); zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "volume size cannot be zero")); return (zfs_error(hdl, EZFS_BADPROP, errbuf)); } if (size % blocksize != 0) { nvlist_free(props); zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "volume size must be a multiple of volume block " "size")); return (zfs_error(hdl, EZFS_BADPROP, errbuf)); } } (void) parent_name(path, parent, sizeof (parent)); if (zfs_crypto_create(hdl, parent, props, NULL, B_TRUE, &wkeydata, &wkeylen) != 0) { nvlist_free(props); return (zfs_error(hdl, EZFS_CRYPTOFAILED, errbuf)); } /* create the dataset */ ret = lzc_create(path, ost, props, wkeydata, wkeylen); nvlist_free(props); if (wkeydata != NULL) free(wkeydata); /* check for failure */ if (ret != 0) { switch (errno) { case ENOENT: zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "no such parent '%s'"), parent); return (zfs_error(hdl, EZFS_NOENT, errbuf)); case ENOTSUP: zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "pool must be upgraded to set this " "property or value")); return (zfs_error(hdl, EZFS_BADVERSION, errbuf)); case EACCES: zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "encryption root's key is not loaded " "or provided")); return (zfs_error(hdl, EZFS_CRYPTOFAILED, errbuf)); case ERANGE: zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "invalid property value(s) specified")); return (zfs_error(hdl, EZFS_BADPROP, errbuf)); #ifdef _ILP32 case EOVERFLOW: /* * This platform can't address a volume this big. */ if (type == ZFS_TYPE_VOLUME) return (zfs_error(hdl, EZFS_VOLTOOBIG, errbuf)); zfs_fallthrough; #endif default: return (zfs_standard_error(hdl, errno, errbuf)); } } return (0); } /* * Destroys the given dataset. The caller must make sure that the filesystem * isn't mounted, and that there are no active dependents. If the file system * does not exist this function does nothing. */ int zfs_destroy(zfs_handle_t *zhp, boolean_t defer) { int error; if (zhp->zfs_type != ZFS_TYPE_SNAPSHOT && defer) return (EINVAL); if (zhp->zfs_type == ZFS_TYPE_BOOKMARK) { nvlist_t *nv = fnvlist_alloc(); fnvlist_add_boolean(nv, zhp->zfs_name); error = lzc_destroy_bookmarks(nv, NULL); fnvlist_free(nv); if (error != 0) { return (zfs_standard_error_fmt(zhp->zfs_hdl, error, dgettext(TEXT_DOMAIN, "cannot destroy '%s'"), zhp->zfs_name)); } return (0); } if (zhp->zfs_type == ZFS_TYPE_SNAPSHOT) { nvlist_t *nv = fnvlist_alloc(); fnvlist_add_boolean(nv, zhp->zfs_name); error = lzc_destroy_snaps(nv, defer, NULL); fnvlist_free(nv); } else { error = lzc_destroy(zhp->zfs_name); } if (error != 0 && error != ENOENT) { return (zfs_standard_error_fmt(zhp->zfs_hdl, errno, dgettext(TEXT_DOMAIN, "cannot destroy '%s'"), zhp->zfs_name)); } remove_mountpoint(zhp); return (0); } struct destroydata { nvlist_t *nvl; const char *snapname; }; static int zfs_check_snap_cb(zfs_handle_t *zhp, void *arg) { struct destroydata *dd = arg; char name[ZFS_MAX_DATASET_NAME_LEN]; int rv = 0; if (snprintf(name, sizeof (name), "%s@%s", zhp->zfs_name, dd->snapname) >= sizeof (name)) return (EINVAL); if (lzc_exists(name)) fnvlist_add_boolean(dd->nvl, name); rv = zfs_iter_filesystems(zhp, zfs_check_snap_cb, dd); zfs_close(zhp); return (rv); } /* * Destroys all snapshots with the given name in zhp & descendants. */ int zfs_destroy_snaps(zfs_handle_t *zhp, char *snapname, boolean_t defer) { int ret; struct destroydata dd = { 0 }; dd.snapname = snapname; dd.nvl = fnvlist_alloc(); (void) zfs_check_snap_cb(zfs_handle_dup(zhp), &dd); if (nvlist_empty(dd.nvl)) { ret = zfs_standard_error_fmt(zhp->zfs_hdl, ENOENT, dgettext(TEXT_DOMAIN, "cannot destroy '%s@%s'"), zhp->zfs_name, snapname); } else { ret = zfs_destroy_snaps_nvl(zhp->zfs_hdl, dd.nvl, defer); } fnvlist_free(dd.nvl); return (ret); } /* * Destroys all the snapshots named in the nvlist. */ int zfs_destroy_snaps_nvl(libzfs_handle_t *hdl, nvlist_t *snaps, boolean_t defer) { nvlist_t *errlist = NULL; nvpair_t *pair; int ret = zfs_destroy_snaps_nvl_os(hdl, snaps); if (ret != 0) return (ret); ret = lzc_destroy_snaps(snaps, defer, &errlist); if (ret == 0) { nvlist_free(errlist); return (0); } if (nvlist_empty(errlist)) { char errbuf[ERRBUFLEN]; (void) snprintf(errbuf, sizeof (errbuf), dgettext(TEXT_DOMAIN, "cannot destroy snapshots")); ret = zfs_standard_error(hdl, ret, errbuf); } for (pair = nvlist_next_nvpair(errlist, NULL); pair != NULL; pair = nvlist_next_nvpair(errlist, pair)) { char errbuf[ERRBUFLEN]; (void) snprintf(errbuf, sizeof (errbuf), dgettext(TEXT_DOMAIN, "cannot destroy snapshot %s"), nvpair_name(pair)); switch (fnvpair_value_int32(pair)) { case EEXIST: zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "snapshot is cloned")); ret = zfs_error(hdl, EZFS_EXISTS, errbuf); break; default: ret = zfs_standard_error(hdl, errno, errbuf); break; } } nvlist_free(errlist); return (ret); } /* * Clones the given dataset. The target must be of the same type as the source. */ int zfs_clone(zfs_handle_t *zhp, const char *target, nvlist_t *props) { char parent[ZFS_MAX_DATASET_NAME_LEN]; int ret; char errbuf[ERRBUFLEN]; libzfs_handle_t *hdl = zhp->zfs_hdl; uint64_t zoned; assert(zhp->zfs_type == ZFS_TYPE_SNAPSHOT); (void) snprintf(errbuf, sizeof (errbuf), dgettext(TEXT_DOMAIN, "cannot create '%s'"), target); /* validate the target/clone name */ if (!zfs_validate_name(hdl, target, ZFS_TYPE_FILESYSTEM, B_TRUE)) return (zfs_error(hdl, EZFS_INVALIDNAME, errbuf)); /* validate parents exist */ if (check_parents(hdl, target, &zoned, B_FALSE, NULL) != 0) return (-1); (void) parent_name(target, parent, sizeof (parent)); /* do the clone */ if (props) { zfs_type_t type = ZFS_TYPE_FILESYSTEM; if (ZFS_IS_VOLUME(zhp)) type = ZFS_TYPE_VOLUME; if ((props = zfs_valid_proplist(hdl, type, props, zoned, zhp, zhp->zpool_hdl, B_TRUE, errbuf)) == NULL) return (-1); if (zfs_fix_auto_resv(zhp, props) == -1) { nvlist_free(props); return (-1); } } if (zfs_crypto_clone_check(hdl, zhp, parent, props) != 0) { nvlist_free(props); return (zfs_error(hdl, EZFS_CRYPTOFAILED, errbuf)); } ret = lzc_clone(target, zhp->zfs_name, props); nvlist_free(props); if (ret != 0) { switch (errno) { case ENOENT: /* * The parent doesn't exist. We should have caught this * above, but there may a race condition that has since * destroyed the parent. * * At this point, we don't know whether it's the source * that doesn't exist anymore, or whether the target * dataset doesn't exist. */ zfs_error_aux(zhp->zfs_hdl, dgettext(TEXT_DOMAIN, "no such parent '%s'"), parent); return (zfs_error(zhp->zfs_hdl, EZFS_NOENT, errbuf)); case EXDEV: zfs_error_aux(zhp->zfs_hdl, dgettext(TEXT_DOMAIN, "source and target pools differ")); return (zfs_error(zhp->zfs_hdl, EZFS_CROSSTARGET, errbuf)); default: return (zfs_standard_error(zhp->zfs_hdl, errno, errbuf)); } } return (ret); } /* * Promotes the given clone fs to be the clone parent. */ int zfs_promote(zfs_handle_t *zhp) { libzfs_handle_t *hdl = zhp->zfs_hdl; char snapname[ZFS_MAX_DATASET_NAME_LEN]; int ret; char errbuf[ERRBUFLEN]; (void) snprintf(errbuf, sizeof (errbuf), dgettext(TEXT_DOMAIN, "cannot promote '%s'"), zhp->zfs_name); if (zhp->zfs_type == ZFS_TYPE_SNAPSHOT) { zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "snapshots can not be promoted")); return (zfs_error(hdl, EZFS_BADTYPE, errbuf)); } if (zhp->zfs_dmustats.dds_origin[0] == '\0') { zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "not a cloned filesystem")); return (zfs_error(hdl, EZFS_BADTYPE, errbuf)); } if (!zfs_validate_name(hdl, zhp->zfs_name, zhp->zfs_type, B_TRUE)) return (zfs_error(hdl, EZFS_INVALIDNAME, errbuf)); ret = lzc_promote(zhp->zfs_name, snapname, sizeof (snapname)); if (ret != 0) { switch (ret) { case EACCES: /* * Promoting encrypted dataset outside its * encryption root. */ zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "cannot promote dataset outside its " "encryption root")); return (zfs_error(hdl, EZFS_EXISTS, errbuf)); case EEXIST: /* There is a conflicting snapshot name. */ zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "conflicting snapshot '%s' from parent '%s'"), snapname, zhp->zfs_dmustats.dds_origin); return (zfs_error(hdl, EZFS_EXISTS, errbuf)); default: return (zfs_standard_error(hdl, ret, errbuf)); } } return (ret); } typedef struct snapdata { nvlist_t *sd_nvl; const char *sd_snapname; } snapdata_t; static int zfs_snapshot_cb(zfs_handle_t *zhp, void *arg) { snapdata_t *sd = arg; char name[ZFS_MAX_DATASET_NAME_LEN]; int rv = 0; if (zfs_prop_get_int(zhp, ZFS_PROP_INCONSISTENT) == 0) { if (snprintf(name, sizeof (name), "%s@%s", zfs_get_name(zhp), sd->sd_snapname) >= sizeof (name)) return (EINVAL); fnvlist_add_boolean(sd->sd_nvl, name); rv = zfs_iter_filesystems(zhp, zfs_snapshot_cb, sd); } zfs_close(zhp); return (rv); } /* * Creates snapshots. The keys in the snaps nvlist are the snapshots to be * created. */ int zfs_snapshot_nvl(libzfs_handle_t *hdl, nvlist_t *snaps, nvlist_t *props) { int ret; char errbuf[ERRBUFLEN]; nvpair_t *elem; nvlist_t *errors; zpool_handle_t *zpool_hdl; char pool[ZFS_MAX_DATASET_NAME_LEN]; (void) snprintf(errbuf, sizeof (errbuf), dgettext(TEXT_DOMAIN, "cannot create snapshots ")); elem = NULL; while ((elem = nvlist_next_nvpair(snaps, elem)) != NULL) { const char *snapname = nvpair_name(elem); /* validate the target name */ if (!zfs_validate_name(hdl, snapname, ZFS_TYPE_SNAPSHOT, B_TRUE)) { (void) snprintf(errbuf, sizeof (errbuf), dgettext(TEXT_DOMAIN, "cannot create snapshot '%s'"), snapname); return (zfs_error(hdl, EZFS_INVALIDNAME, errbuf)); } } /* * get pool handle for prop validation. assumes all snaps are in the * same pool, as does lzc_snapshot (below). */ elem = nvlist_next_nvpair(snaps, NULL); if (elem == NULL) return (-1); (void) strlcpy(pool, nvpair_name(elem), sizeof (pool)); pool[strcspn(pool, "/@")] = '\0'; zpool_hdl = zpool_open(hdl, pool); if (zpool_hdl == NULL) return (-1); if (props != NULL && (props = zfs_valid_proplist(hdl, ZFS_TYPE_SNAPSHOT, props, B_FALSE, NULL, zpool_hdl, B_FALSE, errbuf)) == NULL) { zpool_close(zpool_hdl); return (-1); } zpool_close(zpool_hdl); ret = lzc_snapshot(snaps, props, &errors); if (ret != 0) { boolean_t printed = B_FALSE; for (elem = nvlist_next_nvpair(errors, NULL); elem != NULL; elem = nvlist_next_nvpair(errors, elem)) { (void) snprintf(errbuf, sizeof (errbuf), dgettext(TEXT_DOMAIN, "cannot create snapshot '%s'"), nvpair_name(elem)); (void) zfs_standard_error(hdl, fnvpair_value_int32(elem), errbuf); printed = B_TRUE; } if (!printed) { switch (ret) { case EXDEV: zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "multiple snapshots of same " "fs not allowed")); (void) zfs_error(hdl, EZFS_EXISTS, errbuf); break; default: (void) zfs_standard_error(hdl, ret, errbuf); } } } nvlist_free(props); nvlist_free(errors); return (ret); } int zfs_snapshot(libzfs_handle_t *hdl, const char *path, boolean_t recursive, nvlist_t *props) { int ret; snapdata_t sd = { 0 }; char fsname[ZFS_MAX_DATASET_NAME_LEN]; char *cp; zfs_handle_t *zhp; char errbuf[ERRBUFLEN]; (void) snprintf(errbuf, sizeof (errbuf), dgettext(TEXT_DOMAIN, "cannot snapshot %s"), path); if (!zfs_validate_name(hdl, path, ZFS_TYPE_SNAPSHOT, B_TRUE)) return (zfs_error(hdl, EZFS_INVALIDNAME, errbuf)); (void) strlcpy(fsname, path, sizeof (fsname)); cp = strchr(fsname, '@'); *cp = '\0'; sd.sd_snapname = cp + 1; if ((zhp = zfs_open(hdl, fsname, ZFS_TYPE_FILESYSTEM | ZFS_TYPE_VOLUME)) == NULL) { return (-1); } sd.sd_nvl = fnvlist_alloc(); if (recursive) { (void) zfs_snapshot_cb(zfs_handle_dup(zhp), &sd); } else { fnvlist_add_boolean(sd.sd_nvl, path); } ret = zfs_snapshot_nvl(hdl, sd.sd_nvl, props); fnvlist_free(sd.sd_nvl); zfs_close(zhp); return (ret); } /* * Destroy any more recent snapshots. We invoke this callback on any dependents * of the snapshot first. If the 'cb_dependent' member is non-zero, then this * is a dependent and we should just destroy it without checking the transaction * group. */ typedef struct rollback_data { const char *cb_target; /* the snapshot */ uint64_t cb_create; /* creation time reference */ boolean_t cb_error; boolean_t cb_force; } rollback_data_t; static int rollback_destroy_dependent(zfs_handle_t *zhp, void *data) { rollback_data_t *cbp = data; prop_changelist_t *clp; /* We must destroy this clone; first unmount it */ clp = changelist_gather(zhp, ZFS_PROP_NAME, 0, cbp->cb_force ? MS_FORCE: 0); if (clp == NULL || changelist_prefix(clp) != 0) { cbp->cb_error = B_TRUE; zfs_close(zhp); return (0); } if (zfs_destroy(zhp, B_FALSE) != 0) cbp->cb_error = B_TRUE; else changelist_remove(clp, zhp->zfs_name); (void) changelist_postfix(clp); changelist_free(clp); zfs_close(zhp); return (0); } static int rollback_destroy(zfs_handle_t *zhp, void *data) { rollback_data_t *cbp = data; if (zfs_prop_get_int(zhp, ZFS_PROP_CREATETXG) > cbp->cb_create) { cbp->cb_error |= zfs_iter_dependents(zhp, B_FALSE, rollback_destroy_dependent, cbp); cbp->cb_error |= zfs_destroy(zhp, B_FALSE); } zfs_close(zhp); return (0); } /* * Given a dataset, rollback to a specific snapshot, discarding any * data changes since then and making it the active dataset. * * Any snapshots and bookmarks more recent than the target are * destroyed, along with their dependents (i.e. clones). */ int zfs_rollback(zfs_handle_t *zhp, zfs_handle_t *snap, boolean_t force) { rollback_data_t cb = { 0 }; int err; boolean_t restore_resv = 0; uint64_t old_volsize = 0, new_volsize; zfs_prop_t resv_prop = { 0 }; uint64_t min_txg = 0; assert(zhp->zfs_type == ZFS_TYPE_FILESYSTEM || zhp->zfs_type == ZFS_TYPE_VOLUME); /* * Destroy all recent snapshots and their dependents. */ cb.cb_force = force; cb.cb_target = snap->zfs_name; cb.cb_create = zfs_prop_get_int(snap, ZFS_PROP_CREATETXG); if (cb.cb_create > 0) min_txg = cb.cb_create; (void) zfs_iter_snapshots(zhp, B_FALSE, rollback_destroy, &cb, min_txg, 0); (void) zfs_iter_bookmarks(zhp, rollback_destroy, &cb); if (cb.cb_error) return (-1); /* * Now that we have verified that the snapshot is the latest, * rollback to the given snapshot. */ if (zhp->zfs_type == ZFS_TYPE_VOLUME) { if (zfs_which_resv_prop(zhp, &resv_prop) < 0) return (-1); old_volsize = zfs_prop_get_int(zhp, ZFS_PROP_VOLSIZE); restore_resv = (old_volsize == zfs_prop_get_int(zhp, resv_prop)); } /* * Pass both the filesystem and the wanted snapshot names, * we would get an error back if the snapshot is destroyed or * a new snapshot is created before this request is processed. */ err = lzc_rollback_to(zhp->zfs_name, snap->zfs_name); if (err != 0) { char errbuf[ERRBUFLEN]; (void) snprintf(errbuf, sizeof (errbuf), dgettext(TEXT_DOMAIN, "cannot rollback '%s'"), zhp->zfs_name); switch (err) { case EEXIST: zfs_error_aux(zhp->zfs_hdl, dgettext(TEXT_DOMAIN, "there is a snapshot or bookmark more recent " "than '%s'"), snap->zfs_name); (void) zfs_error(zhp->zfs_hdl, EZFS_EXISTS, errbuf); break; case ESRCH: zfs_error_aux(zhp->zfs_hdl, dgettext(TEXT_DOMAIN, "'%s' is not found among snapshots of '%s'"), snap->zfs_name, zhp->zfs_name); (void) zfs_error(zhp->zfs_hdl, EZFS_NOENT, errbuf); break; case EINVAL: (void) zfs_error(zhp->zfs_hdl, EZFS_BADTYPE, errbuf); break; default: (void) zfs_standard_error(zhp->zfs_hdl, err, errbuf); } return (err); } /* * For volumes, if the pre-rollback volsize matched the pre- * rollback reservation and the volsize has changed then set * the reservation property to the post-rollback volsize. * Make a new handle since the rollback closed the dataset. */ if ((zhp->zfs_type == ZFS_TYPE_VOLUME) && (zhp = make_dataset_handle(zhp->zfs_hdl, zhp->zfs_name))) { if (restore_resv) { new_volsize = zfs_prop_get_int(zhp, ZFS_PROP_VOLSIZE); if (old_volsize != new_volsize) err = zfs_prop_set_int(zhp, resv_prop, new_volsize); } zfs_close(zhp); } return (err); } /* * Renames the given dataset. */ int zfs_rename(zfs_handle_t *zhp, const char *target, renameflags_t flags) { int ret = 0; zfs_cmd_t zc = {"\0"}; char *delim; prop_changelist_t *cl = NULL; char parent[ZFS_MAX_DATASET_NAME_LEN]; char property[ZFS_MAXPROPLEN]; libzfs_handle_t *hdl = zhp->zfs_hdl; char errbuf[ERRBUFLEN]; /* if we have the same exact name, just return success */ if (strcmp(zhp->zfs_name, target) == 0) return (0); (void) snprintf(errbuf, sizeof (errbuf), dgettext(TEXT_DOMAIN, "cannot rename to '%s'"), target); /* make sure source name is valid */ if (!zfs_validate_name(hdl, zhp->zfs_name, zhp->zfs_type, B_TRUE)) return (zfs_error(hdl, EZFS_INVALIDNAME, errbuf)); /* * Make sure the target name is valid */ if (zhp->zfs_type == ZFS_TYPE_SNAPSHOT) { if ((strchr(target, '@') == NULL) || *target == '@') { /* * Snapshot target name is abbreviated, * reconstruct full dataset name */ (void) strlcpy(parent, zhp->zfs_name, sizeof (parent)); delim = strchr(parent, '@'); if (strchr(target, '@') == NULL) *(++delim) = '\0'; else *delim = '\0'; (void) strlcat(parent, target, sizeof (parent)); target = parent; } else { /* * Make sure we're renaming within the same dataset. */ delim = strchr(target, '@'); if (strncmp(zhp->zfs_name, target, delim - target) != 0 || zhp->zfs_name[delim - target] != '@') { zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "snapshots must be part of same " "dataset")); return (zfs_error(hdl, EZFS_CROSSTARGET, errbuf)); } } if (!zfs_validate_name(hdl, target, zhp->zfs_type, B_TRUE)) return (zfs_error(hdl, EZFS_INVALIDNAME, errbuf)); } else { if (flags.recursive) { zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "recursive rename must be a snapshot")); return (zfs_error(hdl, EZFS_BADTYPE, errbuf)); } if (!zfs_validate_name(hdl, target, zhp->zfs_type, B_TRUE)) return (zfs_error(hdl, EZFS_INVALIDNAME, errbuf)); /* validate parents */ if (check_parents(hdl, target, NULL, B_FALSE, NULL) != 0) return (-1); /* make sure we're in the same pool */ verify((delim = strchr(target, '/')) != NULL); if (strncmp(zhp->zfs_name, target, delim - target) != 0 || zhp->zfs_name[delim - target] != '/') { zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "datasets must be within same pool")); return (zfs_error(hdl, EZFS_CROSSTARGET, errbuf)); } /* new name cannot be a child of the current dataset name */ if (is_descendant(zhp->zfs_name, target)) { zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "New dataset name cannot be a descendant of " "current dataset name")); return (zfs_error(hdl, EZFS_INVALIDNAME, errbuf)); } } (void) snprintf(errbuf, sizeof (errbuf), dgettext(TEXT_DOMAIN, "cannot rename '%s'"), zhp->zfs_name); if (getzoneid() == GLOBAL_ZONEID && zfs_prop_get_int(zhp, ZFS_PROP_ZONED)) { zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "dataset is used in a non-global zone")); return (zfs_error(hdl, EZFS_ZONED, errbuf)); } /* * Avoid unmounting file systems with mountpoint property set to * 'legacy' or 'none' even if -u option is not given. */ if (zhp->zfs_type == ZFS_TYPE_FILESYSTEM && !flags.recursive && !flags.nounmount && zfs_prop_get(zhp, ZFS_PROP_MOUNTPOINT, property, sizeof (property), NULL, NULL, 0, B_FALSE) == 0 && (strcmp(property, "legacy") == 0 || strcmp(property, "none") == 0)) { flags.nounmount = B_TRUE; } if (flags.recursive) { char *parentname = zfs_strdup(zhp->zfs_hdl, zhp->zfs_name); delim = strchr(parentname, '@'); *delim = '\0'; zfs_handle_t *zhrp = zfs_open(zhp->zfs_hdl, parentname, ZFS_TYPE_DATASET); free(parentname); if (zhrp == NULL) { ret = -1; goto error; } zfs_close(zhrp); } else if (zhp->zfs_type != ZFS_TYPE_SNAPSHOT) { if ((cl = changelist_gather(zhp, ZFS_PROP_NAME, flags.nounmount ? CL_GATHER_DONT_UNMOUNT : CL_GATHER_ITER_MOUNTED, flags.forceunmount ? MS_FORCE : 0)) == NULL) return (-1); if (changelist_haszonedchild(cl)) { zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "child dataset with inherited mountpoint is used " "in a non-global zone")); (void) zfs_error(hdl, EZFS_ZONED, errbuf); ret = -1; goto error; } if ((ret = changelist_prefix(cl)) != 0) goto error; } if (ZFS_IS_VOLUME(zhp)) zc.zc_objset_type = DMU_OST_ZVOL; else zc.zc_objset_type = DMU_OST_ZFS; (void) strlcpy(zc.zc_name, zhp->zfs_name, sizeof (zc.zc_name)); (void) strlcpy(zc.zc_value, target, sizeof (zc.zc_value)); zc.zc_cookie = !!flags.recursive; zc.zc_cookie |= (!!flags.nounmount) << 1; if ((ret = zfs_ioctl(zhp->zfs_hdl, ZFS_IOC_RENAME, &zc)) != 0) { /* * if it was recursive, the one that actually failed will * be in zc.zc_name */ (void) snprintf(errbuf, sizeof (errbuf), dgettext(TEXT_DOMAIN, "cannot rename '%s'"), zc.zc_name); if (flags.recursive && errno == EEXIST) { zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "a child dataset already has a snapshot " "with the new name")); (void) zfs_error(hdl, EZFS_EXISTS, errbuf); } else if (errno == EACCES) { zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "cannot move encrypted child outside of " "its encryption root")); (void) zfs_error(hdl, EZFS_CRYPTOFAILED, errbuf); } else { (void) zfs_standard_error(zhp->zfs_hdl, errno, errbuf); } /* * On failure, we still want to remount any filesystems that * were previously mounted, so we don't alter the system state. */ if (cl != NULL) (void) changelist_postfix(cl); } else { if (cl != NULL) { changelist_rename(cl, zfs_get_name(zhp), target); ret = changelist_postfix(cl); } } error: if (cl != NULL) { changelist_free(cl); } return (ret); } nvlist_t * zfs_get_all_props(zfs_handle_t *zhp) { return (zhp->zfs_props); } nvlist_t * zfs_get_recvd_props(zfs_handle_t *zhp) { if (zhp->zfs_recvd_props == NULL) if (get_recvd_props_ioctl(zhp) != 0) return (NULL); return (zhp->zfs_recvd_props); } nvlist_t * zfs_get_user_props(zfs_handle_t *zhp) { return (zhp->zfs_user_props); } /* * This function is used by 'zfs list' to determine the exact set of columns to * display, and their maximum widths. This does two main things: * * - If this is a list of all properties, then expand the list to include * all native properties, and set a flag so that for each dataset we look * for new unique user properties and add them to the list. * * - For non fixed-width properties, keep track of the maximum width seen * so that we can size the column appropriately. If the user has * requested received property values, we also need to compute the width * of the RECEIVED column. */ int zfs_expand_proplist(zfs_handle_t *zhp, zprop_list_t **plp, boolean_t received, boolean_t literal) { libzfs_handle_t *hdl = zhp->zfs_hdl; zprop_list_t *entry; zprop_list_t **last, **start; nvlist_t *userprops, *propval; nvpair_t *elem; char *strval; char buf[ZFS_MAXPROPLEN]; if (zprop_expand_list(hdl, plp, ZFS_TYPE_DATASET) != 0) return (-1); userprops = zfs_get_user_props(zhp); entry = *plp; if (entry->pl_all && nvlist_next_nvpair(userprops, NULL) != NULL) { /* * Go through and add any user properties as necessary. We * start by incrementing our list pointer to the first * non-native property. */ start = plp; while (*start != NULL) { if ((*start)->pl_prop == ZPROP_USERPROP) break; start = &(*start)->pl_next; } elem = NULL; while ((elem = nvlist_next_nvpair(userprops, elem)) != NULL) { /* * See if we've already found this property in our list. */ for (last = start; *last != NULL; last = &(*last)->pl_next) { if (strcmp((*last)->pl_user_prop, nvpair_name(elem)) == 0) break; } if (*last == NULL) { entry = zfs_alloc(hdl, sizeof (zprop_list_t)); entry->pl_user_prop = zfs_strdup(hdl, nvpair_name(elem)); entry->pl_prop = ZPROP_USERPROP; entry->pl_width = strlen(nvpair_name(elem)); entry->pl_all = B_TRUE; *last = entry; } } } /* * Now go through and check the width of any non-fixed columns */ for (entry = *plp; entry != NULL; entry = entry->pl_next) { if (entry->pl_fixed && !literal) continue; if (entry->pl_prop != ZPROP_USERPROP) { if (zfs_prop_get(zhp, entry->pl_prop, buf, sizeof (buf), NULL, NULL, 0, literal) == 0) { if (strlen(buf) > entry->pl_width) entry->pl_width = strlen(buf); } if (received && zfs_prop_get_recvd(zhp, zfs_prop_to_name(entry->pl_prop), buf, sizeof (buf), literal) == 0) if (strlen(buf) > entry->pl_recvd_width) entry->pl_recvd_width = strlen(buf); } else { if (nvlist_lookup_nvlist(userprops, entry->pl_user_prop, &propval) == 0) { strval = fnvlist_lookup_string(propval, ZPROP_VALUE); if (strlen(strval) > entry->pl_width) entry->pl_width = strlen(strval); } if (received && zfs_prop_get_recvd(zhp, entry->pl_user_prop, buf, sizeof (buf), literal) == 0) if (strlen(buf) > entry->pl_recvd_width) entry->pl_recvd_width = strlen(buf); } } return (0); } void zfs_prune_proplist(zfs_handle_t *zhp, uint8_t *props) { nvpair_t *curr; nvpair_t *next; /* * Keep a reference to the props-table against which we prune the * properties. */ zhp->zfs_props_table = props; curr = nvlist_next_nvpair(zhp->zfs_props, NULL); while (curr) { zfs_prop_t zfs_prop = zfs_name_to_prop(nvpair_name(curr)); next = nvlist_next_nvpair(zhp->zfs_props, curr); /* * User properties will result in ZPROP_USERPROP (an alias * for ZPROP_INVAL), and since we * only know how to prune standard ZFS properties, we always * leave these in the list. This can also happen if we * encounter an unknown DSL property (when running older * software, for example). */ if (zfs_prop != ZPROP_USERPROP && props[zfs_prop] == B_FALSE) (void) nvlist_remove(zhp->zfs_props, nvpair_name(curr), nvpair_type(curr)); curr = next; } } static int zfs_smb_acl_mgmt(libzfs_handle_t *hdl, char *dataset, char *path, zfs_smb_acl_op_t cmd, char *resource1, char *resource2) { zfs_cmd_t zc = {"\0"}; nvlist_t *nvlist = NULL; int error; (void) strlcpy(zc.zc_name, dataset, sizeof (zc.zc_name)); (void) strlcpy(zc.zc_value, path, sizeof (zc.zc_value)); zc.zc_cookie = (uint64_t)cmd; if (cmd == ZFS_SMB_ACL_RENAME) { if (nvlist_alloc(&nvlist, NV_UNIQUE_NAME, 0) != 0) { (void) no_memory(hdl); return (0); } } switch (cmd) { case ZFS_SMB_ACL_ADD: case ZFS_SMB_ACL_REMOVE: (void) strlcpy(zc.zc_string, resource1, sizeof (zc.zc_string)); break; case ZFS_SMB_ACL_RENAME: if (nvlist_add_string(nvlist, ZFS_SMB_ACL_SRC, resource1) != 0) { (void) no_memory(hdl); return (-1); } if (nvlist_add_string(nvlist, ZFS_SMB_ACL_TARGET, resource2) != 0) { (void) no_memory(hdl); return (-1); } zcmd_write_src_nvlist(hdl, &zc, nvlist); break; case ZFS_SMB_ACL_PURGE: break; default: return (-1); } error = ioctl(hdl->libzfs_fd, ZFS_IOC_SMB_ACL, &zc); nvlist_free(nvlist); return (error); } int zfs_smb_acl_add(libzfs_handle_t *hdl, char *dataset, char *path, char *resource) { return (zfs_smb_acl_mgmt(hdl, dataset, path, ZFS_SMB_ACL_ADD, resource, NULL)); } int zfs_smb_acl_remove(libzfs_handle_t *hdl, char *dataset, char *path, char *resource) { return (zfs_smb_acl_mgmt(hdl, dataset, path, ZFS_SMB_ACL_REMOVE, resource, NULL)); } int zfs_smb_acl_purge(libzfs_handle_t *hdl, char *dataset, char *path) { return (zfs_smb_acl_mgmt(hdl, dataset, path, ZFS_SMB_ACL_PURGE, NULL, NULL)); } int zfs_smb_acl_rename(libzfs_handle_t *hdl, char *dataset, char *path, char *oldname, char *newname) { return (zfs_smb_acl_mgmt(hdl, dataset, path, ZFS_SMB_ACL_RENAME, oldname, newname)); } int zfs_userspace(zfs_handle_t *zhp, zfs_userquota_prop_t type, zfs_userspace_cb_t func, void *arg) { zfs_cmd_t zc = {"\0"}; zfs_useracct_t buf[100]; libzfs_handle_t *hdl = zhp->zfs_hdl; int ret; (void) strlcpy(zc.zc_name, zhp->zfs_name, sizeof (zc.zc_name)); zc.zc_objset_type = type; zc.zc_nvlist_dst = (uintptr_t)buf; for (;;) { zfs_useracct_t *zua = buf; zc.zc_nvlist_dst_size = sizeof (buf); if (zfs_ioctl(hdl, ZFS_IOC_USERSPACE_MANY, &zc) != 0) { if ((errno == ENOTSUP && (type == ZFS_PROP_USEROBJUSED || type == ZFS_PROP_GROUPOBJUSED || type == ZFS_PROP_USEROBJQUOTA || type == ZFS_PROP_GROUPOBJQUOTA || type == ZFS_PROP_PROJECTOBJUSED || type == ZFS_PROP_PROJECTOBJQUOTA || type == ZFS_PROP_PROJECTUSED || type == ZFS_PROP_PROJECTQUOTA))) break; return (zfs_standard_error_fmt(hdl, errno, dgettext(TEXT_DOMAIN, "cannot get used/quota for %s"), zc.zc_name)); } if (zc.zc_nvlist_dst_size == 0) break; while (zc.zc_nvlist_dst_size > 0) { if ((ret = func(arg, zua->zu_domain, zua->zu_rid, zua->zu_space)) != 0) return (ret); zua++; zc.zc_nvlist_dst_size -= sizeof (zfs_useracct_t); } } return (0); } struct holdarg { nvlist_t *nvl; const char *snapname; const char *tag; boolean_t recursive; int error; }; static int zfs_hold_one(zfs_handle_t *zhp, void *arg) { struct holdarg *ha = arg; char name[ZFS_MAX_DATASET_NAME_LEN]; int rv = 0; if (snprintf(name, sizeof (name), "%s@%s", zhp->zfs_name, ha->snapname) >= sizeof (name)) return (EINVAL); if (lzc_exists(name)) fnvlist_add_string(ha->nvl, name, ha->tag); if (ha->recursive) rv = zfs_iter_filesystems(zhp, zfs_hold_one, ha); zfs_close(zhp); return (rv); } int zfs_hold(zfs_handle_t *zhp, const char *snapname, const char *tag, boolean_t recursive, int cleanup_fd) { int ret; struct holdarg ha; ha.nvl = fnvlist_alloc(); ha.snapname = snapname; ha.tag = tag; ha.recursive = recursive; (void) zfs_hold_one(zfs_handle_dup(zhp), &ha); if (nvlist_empty(ha.nvl)) { char errbuf[ERRBUFLEN]; fnvlist_free(ha.nvl); ret = ENOENT; (void) snprintf(errbuf, sizeof (errbuf), dgettext(TEXT_DOMAIN, "cannot hold snapshot '%s@%s'"), zhp->zfs_name, snapname); (void) zfs_standard_error(zhp->zfs_hdl, ret, errbuf); return (ret); } ret = zfs_hold_nvl(zhp, cleanup_fd, ha.nvl); fnvlist_free(ha.nvl); return (ret); } int zfs_hold_nvl(zfs_handle_t *zhp, int cleanup_fd, nvlist_t *holds) { int ret; nvlist_t *errors; libzfs_handle_t *hdl = zhp->zfs_hdl; char errbuf[ERRBUFLEN]; nvpair_t *elem; errors = NULL; ret = lzc_hold(holds, cleanup_fd, &errors); if (ret == 0) { /* There may be errors even in the success case. */ fnvlist_free(errors); return (0); } if (nvlist_empty(errors)) { /* no hold-specific errors */ (void) snprintf(errbuf, sizeof (errbuf), dgettext(TEXT_DOMAIN, "cannot hold")); switch (ret) { case ENOTSUP: zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "pool must be upgraded")); (void) zfs_error(hdl, EZFS_BADVERSION, errbuf); break; case EINVAL: (void) zfs_error(hdl, EZFS_BADTYPE, errbuf); break; default: (void) zfs_standard_error(hdl, ret, errbuf); } } for (elem = nvlist_next_nvpair(errors, NULL); elem != NULL; elem = nvlist_next_nvpair(errors, elem)) { (void) snprintf(errbuf, sizeof (errbuf), dgettext(TEXT_DOMAIN, "cannot hold snapshot '%s'"), nvpair_name(elem)); switch (fnvpair_value_int32(elem)) { case E2BIG: /* * Temporary tags wind up having the ds object id * prepended. So even if we passed the length check * above, it's still possible for the tag to wind * up being slightly too long. */ (void) zfs_error(hdl, EZFS_TAGTOOLONG, errbuf); break; case EINVAL: (void) zfs_error(hdl, EZFS_BADTYPE, errbuf); break; case EEXIST: (void) zfs_error(hdl, EZFS_REFTAG_HOLD, errbuf); break; default: (void) zfs_standard_error(hdl, fnvpair_value_int32(elem), errbuf); } } fnvlist_free(errors); return (ret); } static int zfs_release_one(zfs_handle_t *zhp, void *arg) { struct holdarg *ha = arg; char name[ZFS_MAX_DATASET_NAME_LEN]; int rv = 0; nvlist_t *existing_holds; if (snprintf(name, sizeof (name), "%s@%s", zhp->zfs_name, ha->snapname) >= sizeof (name)) { ha->error = EINVAL; rv = EINVAL; } if (lzc_get_holds(name, &existing_holds) != 0) { ha->error = ENOENT; } else if (!nvlist_exists(existing_holds, ha->tag)) { ha->error = ESRCH; } else { nvlist_t *torelease = fnvlist_alloc(); fnvlist_add_boolean(torelease, ha->tag); fnvlist_add_nvlist(ha->nvl, name, torelease); fnvlist_free(torelease); } if (ha->recursive) rv = zfs_iter_filesystems(zhp, zfs_release_one, ha); zfs_close(zhp); return (rv); } int zfs_release(zfs_handle_t *zhp, const char *snapname, const char *tag, boolean_t recursive) { int ret; struct holdarg ha; nvlist_t *errors = NULL; nvpair_t *elem; libzfs_handle_t *hdl = zhp->zfs_hdl; char errbuf[ERRBUFLEN]; ha.nvl = fnvlist_alloc(); ha.snapname = snapname; ha.tag = tag; ha.recursive = recursive; ha.error = 0; (void) zfs_release_one(zfs_handle_dup(zhp), &ha); if (nvlist_empty(ha.nvl)) { fnvlist_free(ha.nvl); ret = ha.error; (void) snprintf(errbuf, sizeof (errbuf), dgettext(TEXT_DOMAIN, "cannot release hold from snapshot '%s@%s'"), zhp->zfs_name, snapname); if (ret == ESRCH) { (void) zfs_error(hdl, EZFS_REFTAG_RELE, errbuf); } else { (void) zfs_standard_error(hdl, ret, errbuf); } return (ret); } ret = lzc_release(ha.nvl, &errors); fnvlist_free(ha.nvl); if (ret == 0) { /* There may be errors even in the success case. */ fnvlist_free(errors); return (0); } if (nvlist_empty(errors)) { /* no hold-specific errors */ (void) snprintf(errbuf, sizeof (errbuf), dgettext(TEXT_DOMAIN, "cannot release")); switch (errno) { case ENOTSUP: zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "pool must be upgraded")); (void) zfs_error(hdl, EZFS_BADVERSION, errbuf); break; default: (void) zfs_standard_error(hdl, errno, errbuf); } } for (elem = nvlist_next_nvpair(errors, NULL); elem != NULL; elem = nvlist_next_nvpair(errors, elem)) { (void) snprintf(errbuf, sizeof (errbuf), dgettext(TEXT_DOMAIN, "cannot release hold from snapshot '%s'"), nvpair_name(elem)); switch (fnvpair_value_int32(elem)) { case ESRCH: (void) zfs_error(hdl, EZFS_REFTAG_RELE, errbuf); break; case EINVAL: (void) zfs_error(hdl, EZFS_BADTYPE, errbuf); break; default: (void) zfs_standard_error(hdl, fnvpair_value_int32(elem), errbuf); } } fnvlist_free(errors); return (ret); } int zfs_get_fsacl(zfs_handle_t *zhp, nvlist_t **nvl) { zfs_cmd_t zc = {"\0"}; libzfs_handle_t *hdl = zhp->zfs_hdl; int nvsz = 2048; void *nvbuf; int err = 0; char errbuf[ERRBUFLEN]; assert(zhp->zfs_type == ZFS_TYPE_VOLUME || zhp->zfs_type == ZFS_TYPE_FILESYSTEM); tryagain: nvbuf = malloc(nvsz); if (nvbuf == NULL) { err = (zfs_error(hdl, EZFS_NOMEM, strerror(errno))); goto out; } zc.zc_nvlist_dst_size = nvsz; zc.zc_nvlist_dst = (uintptr_t)nvbuf; (void) strlcpy(zc.zc_name, zhp->zfs_name, sizeof (zc.zc_name)); if (zfs_ioctl(hdl, ZFS_IOC_GET_FSACL, &zc) != 0) { (void) snprintf(errbuf, sizeof (errbuf), dgettext(TEXT_DOMAIN, "cannot get permissions on '%s'"), zc.zc_name); switch (errno) { case ENOMEM: free(nvbuf); nvsz = zc.zc_nvlist_dst_size; goto tryagain; case ENOTSUP: zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "pool must be upgraded")); err = zfs_error(hdl, EZFS_BADVERSION, errbuf); break; case EINVAL: err = zfs_error(hdl, EZFS_BADTYPE, errbuf); break; case ENOENT: err = zfs_error(hdl, EZFS_NOENT, errbuf); break; default: err = zfs_standard_error(hdl, errno, errbuf); break; } } else { /* success */ int rc = nvlist_unpack(nvbuf, zc.zc_nvlist_dst_size, nvl, 0); if (rc) { err = zfs_standard_error_fmt(hdl, rc, dgettext( TEXT_DOMAIN, "cannot get permissions on '%s'"), zc.zc_name); } } free(nvbuf); out: return (err); } int zfs_set_fsacl(zfs_handle_t *zhp, boolean_t un, nvlist_t *nvl) { zfs_cmd_t zc = {"\0"}; libzfs_handle_t *hdl = zhp->zfs_hdl; char *nvbuf; char errbuf[ERRBUFLEN]; size_t nvsz; int err; assert(zhp->zfs_type == ZFS_TYPE_VOLUME || zhp->zfs_type == ZFS_TYPE_FILESYSTEM); err = nvlist_size(nvl, &nvsz, NV_ENCODE_NATIVE); assert(err == 0); nvbuf = malloc(nvsz); err = nvlist_pack(nvl, &nvbuf, &nvsz, NV_ENCODE_NATIVE, 0); assert(err == 0); zc.zc_nvlist_src_size = nvsz; zc.zc_nvlist_src = (uintptr_t)nvbuf; zc.zc_perm_action = un; (void) strlcpy(zc.zc_name, zhp->zfs_name, sizeof (zc.zc_name)); if (zfs_ioctl(hdl, ZFS_IOC_SET_FSACL, &zc) != 0) { (void) snprintf(errbuf, sizeof (errbuf), dgettext(TEXT_DOMAIN, "cannot set permissions on '%s'"), zc.zc_name); switch (errno) { case ENOTSUP: zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "pool must be upgraded")); err = zfs_error(hdl, EZFS_BADVERSION, errbuf); break; case EINVAL: err = zfs_error(hdl, EZFS_BADTYPE, errbuf); break; case ENOENT: err = zfs_error(hdl, EZFS_NOENT, errbuf); break; default: err = zfs_standard_error(hdl, errno, errbuf); break; } } free(nvbuf); return (err); } int zfs_get_holds(zfs_handle_t *zhp, nvlist_t **nvl) { int err; char errbuf[ERRBUFLEN]; err = lzc_get_holds(zhp->zfs_name, nvl); if (err != 0) { libzfs_handle_t *hdl = zhp->zfs_hdl; (void) snprintf(errbuf, sizeof (errbuf), dgettext(TEXT_DOMAIN, "cannot get holds for '%s'"), zhp->zfs_name); switch (err) { case ENOTSUP: zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "pool must be upgraded")); err = zfs_error(hdl, EZFS_BADVERSION, errbuf); break; case EINVAL: err = zfs_error(hdl, EZFS_BADTYPE, errbuf); break; case ENOENT: err = zfs_error(hdl, EZFS_NOENT, errbuf); break; default: err = zfs_standard_error(hdl, errno, errbuf); break; } } return (err); } /* * The theory of raidz space accounting * * The "referenced" property of RAIDZ vdevs is scaled such that a 128KB block * will "reference" 128KB, even though it allocates more than that, to store the * parity information (and perhaps skip sectors). This concept of the * "referenced" (and other DMU space accounting) being lower than the allocated * space by a constant factor is called "raidz deflation." * * As mentioned above, the constant factor for raidz deflation assumes a 128KB * block size. However, zvols typically have a much smaller block size (default * 8KB). These smaller blocks may require proportionally much more parity * information (and perhaps skip sectors). In this case, the change to the * "referenced" property may be much more than the logical block size. * * Suppose a raidz vdev has 5 disks with ashift=12. A 128k block may be written * as follows. * * +-------+-------+-------+-------+-------+ * | disk1 | disk2 | disk3 | disk4 | disk5 | * +-------+-------+-------+-------+-------+ * | P0 | D0 | D8 | D16 | D24 | * | P1 | D1 | D9 | D17 | D25 | * | P2 | D2 | D10 | D18 | D26 | * | P3 | D3 | D11 | D19 | D27 | * | P4 | D4 | D12 | D20 | D28 | * | P5 | D5 | D13 | D21 | D29 | * | P6 | D6 | D14 | D22 | D30 | * | P7 | D7 | D15 | D23 | D31 | * +-------+-------+-------+-------+-------+ * * Above, notice that 160k was allocated: 8 x 4k parity sectors + 32 x 4k data * sectors. The dataset's referenced will increase by 128k and the pool's * allocated and free properties will be adjusted by 160k. * * A 4k block written to the same raidz vdev will require two 4k sectors. The * blank cells represent unallocated space. * * +-------+-------+-------+-------+-------+ * | disk1 | disk2 | disk3 | disk4 | disk5 | * +-------+-------+-------+-------+-------+ * | P0 | D0 | | | | * +-------+-------+-------+-------+-------+ * * Above, notice that the 4k block required one sector for parity and another * for data. vdev_raidz_asize() will return 8k and as such the pool's allocated * and free properties will be adjusted by 8k. The dataset will not be charged * 8k. Rather, it will be charged a value that is scaled according to the * overhead of the 128k block on the same vdev. This 8k allocation will be * charged 8k * 128k / 160k. 128k is from SPA_OLD_MAXBLOCKSIZE and 160k is as * calculated in the 128k block example above. * * Every raidz allocation is sized to be a multiple of nparity+1 sectors. That * is, every raidz1 allocation will be a multiple of 2 sectors, raidz2 * allocations are a multiple of 3 sectors, and raidz3 allocations are a * multiple of of 4 sectors. When a block does not fill the required number of * sectors, skip blocks (sectors) are used. * * An 8k block being written to a raidz vdev may be written as follows: * * +-------+-------+-------+-------+-------+ * | disk1 | disk2 | disk3 | disk4 | disk5 | * +-------+-------+-------+-------+-------+ * | P0 | D0 | D1 | S0 | | * +-------+-------+-------+-------+-------+ * * In order to maintain the nparity+1 allocation size, a skip block (S0) was * added. For this 8k block, the pool's allocated and free properties are * adjusted by 16k and the dataset's referenced is increased by 16k * 128k / * 160k. Again, 128k is from SPA_OLD_MAXBLOCKSIZE and 160k is as calculated in * the 128k block example above. * * The situation is slightly different for dRAID since the minimum allocation * size is the full group width. The same 8K block above would be written as * follows in a dRAID group: * * +-------+-------+-------+-------+-------+ * | disk1 | disk2 | disk3 | disk4 | disk5 | * +-------+-------+-------+-------+-------+ * | P0 | D0 | D1 | S0 | S1 | * +-------+-------+-------+-------+-------+ * * Compression may lead to a variety of block sizes being written for the same * volume or file. There is no clear way to reserve just the amount of space * that will be required, so the worst case (no compression) is assumed. * Note that metadata blocks will typically be compressed, so the reservation * size returned by zvol_volsize_to_reservation() will generally be slightly * larger than the maximum that the volume can reference. */ /* * Derived from function of same name in module/zfs/vdev_raidz.c. Returns the * amount of space (in bytes) that will be allocated for the specified block * size. Note that the "referenced" space accounted will be less than this, but * not necessarily equal to "blksize", due to RAIDZ deflation. */ static uint64_t vdev_raidz_asize(uint64_t ndisks, uint64_t nparity, uint64_t ashift, uint64_t blksize) { uint64_t asize, ndata; ASSERT3U(ndisks, >, nparity); ndata = ndisks - nparity; asize = ((blksize - 1) >> ashift) + 1; asize += nparity * ((asize + ndata - 1) / ndata); asize = roundup(asize, nparity + 1) << ashift; return (asize); } /* * Derived from function of same name in module/zfs/vdev_draid.c. Returns the * amount of space (in bytes) that will be allocated for the specified block * size. */ static uint64_t vdev_draid_asize(uint64_t ndisks, uint64_t nparity, uint64_t ashift, uint64_t blksize) { ASSERT3U(ndisks, >, nparity); uint64_t ndata = ndisks - nparity; uint64_t rows = ((blksize - 1) / (ndata << ashift)) + 1; uint64_t asize = (rows * ndisks) << ashift; return (asize); } /* * Determine how much space will be allocated if it lands on the most space- * inefficient top-level vdev. Returns the size in bytes required to store one * copy of the volume data. See theory comment above. */ static uint64_t volsize_from_vdevs(zpool_handle_t *zhp, uint64_t nblocks, uint64_t blksize) { nvlist_t *config, *tree, **vdevs; uint_t nvdevs; uint64_t ret = 0; config = zpool_get_config(zhp, NULL); if (nvlist_lookup_nvlist(config, ZPOOL_CONFIG_VDEV_TREE, &tree) != 0 || nvlist_lookup_nvlist_array(tree, ZPOOL_CONFIG_CHILDREN, &vdevs, &nvdevs) != 0) { return (nblocks * blksize); } for (int v = 0; v < nvdevs; v++) { char *type; uint64_t nparity, ashift, asize, tsize; uint64_t volsize; if (nvlist_lookup_string(vdevs[v], ZPOOL_CONFIG_TYPE, &type) != 0) continue; if (strcmp(type, VDEV_TYPE_RAIDZ) != 0 && strcmp(type, VDEV_TYPE_DRAID) != 0) continue; if (nvlist_lookup_uint64(vdevs[v], ZPOOL_CONFIG_NPARITY, &nparity) != 0) continue; if (nvlist_lookup_uint64(vdevs[v], ZPOOL_CONFIG_ASHIFT, &ashift) != 0) continue; if (strcmp(type, VDEV_TYPE_RAIDZ) == 0) { nvlist_t **disks; uint_t ndisks; if (nvlist_lookup_nvlist_array(vdevs[v], ZPOOL_CONFIG_CHILDREN, &disks, &ndisks) != 0) continue; /* allocation size for the "typical" 128k block */ tsize = vdev_raidz_asize(ndisks, nparity, ashift, SPA_OLD_MAXBLOCKSIZE); /* allocation size for the blksize block */ asize = vdev_raidz_asize(ndisks, nparity, ashift, blksize); } else { uint64_t ndata; if (nvlist_lookup_uint64(vdevs[v], ZPOOL_CONFIG_DRAID_NDATA, &ndata) != 0) continue; /* allocation size for the "typical" 128k block */ tsize = vdev_draid_asize(ndata + nparity, nparity, ashift, SPA_OLD_MAXBLOCKSIZE); /* allocation size for the blksize block */ asize = vdev_draid_asize(ndata + nparity, nparity, ashift, blksize); } /* * Scale this size down as a ratio of 128k / tsize. * See theory statement above. */ volsize = nblocks * asize * SPA_OLD_MAXBLOCKSIZE / tsize; if (volsize > ret) { ret = volsize; } } if (ret == 0) { ret = nblocks * blksize; } return (ret); } /* * Convert the zvol's volume size to an appropriate reservation. See theory * comment above. * * Note: If this routine is updated, it is necessary to update the ZFS test * suite's shell version in reservation.shlib. */ uint64_t zvol_volsize_to_reservation(zpool_handle_t *zph, uint64_t volsize, nvlist_t *props) { uint64_t numdb; uint64_t nblocks, volblocksize; int ncopies; char *strval; if (nvlist_lookup_string(props, zfs_prop_to_name(ZFS_PROP_COPIES), &strval) == 0) ncopies = atoi(strval); else ncopies = 1; if (nvlist_lookup_uint64(props, zfs_prop_to_name(ZFS_PROP_VOLBLOCKSIZE), &volblocksize) != 0) volblocksize = ZVOL_DEFAULT_BLOCKSIZE; nblocks = volsize / volblocksize; /* * Metadata defaults to using 128k blocks, not volblocksize blocks. For * this reason, only the data blocks are scaled based on vdev config. */ volsize = volsize_from_vdevs(zph, nblocks, volblocksize); /* start with metadnode L0-L6 */ numdb = 7; /* calculate number of indirects */ while (nblocks > 1) { nblocks += DNODES_PER_LEVEL - 1; nblocks /= DNODES_PER_LEVEL; numdb += nblocks; } numdb *= MIN(SPA_DVAS_PER_BP, ncopies + 1); volsize *= ncopies; /* * this is exactly DN_MAX_INDBLKSHIFT when metadata isn't * compressed, but in practice they compress down to about * 1100 bytes */ numdb *= 1ULL << DN_MAX_INDBLKSHIFT; volsize += numdb; return (volsize); } /* * Wait for the given activity and return the status of the wait (whether or not * any waiting was done) in the 'waited' parameter. Non-existent fses are * reported via the 'missing' parameter, rather than by printing an error * message. This is convenient when this function is called in a loop over a * long period of time (as it is, for example, by zfs's wait cmd). In that * scenario, a fs being exported or destroyed should be considered a normal * event, so we don't want to print an error when we find that the fs doesn't * exist. */ int zfs_wait_status(zfs_handle_t *zhp, zfs_wait_activity_t activity, boolean_t *missing, boolean_t *waited) { int error = lzc_wait_fs(zhp->zfs_name, activity, waited); *missing = (error == ENOENT); if (*missing) return (0); if (error != 0) { (void) zfs_standard_error_fmt(zhp->zfs_hdl, error, dgettext(TEXT_DOMAIN, "error waiting in fs '%s'"), zhp->zfs_name); } return (error); } diff --git a/lib/libzfs/libzfs_diff.c b/lib/libzfs/libzfs_diff.c index 80588a860c18..84e140ede665 100644 --- a/lib/libzfs/libzfs_diff.c +++ b/lib/libzfs/libzfs_diff.c @@ -1,788 +1,788 @@ /* * CDDL HEADER START * * The contents of this file are subject to the terms of the * Common Development and Distribution License (the "License"). * You may not use this file except in compliance with the License. * * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE * or https://opensource.org/licenses/CDDL-1.0. * See the License for the specific language governing permissions * and limitations under the License. * * When distributing Covered Code, include this CDDL HEADER in each * file and include the License file at usr/src/OPENSOLARIS.LICENSE. * If applicable, add the following below this CDDL HEADER, with the * fields enclosed by brackets "[]" replaced with your own identifying * information: Portions Copyright [yyyy] [name of copyright owner] * * CDDL HEADER END */ /* * Copyright (c) 2010, Oracle and/or its affiliates. All rights reserved. * Copyright 2015 Nexenta Systems, Inc. All rights reserved. * Copyright (c) 2015, 2018 by Delphix. All rights reserved. * Copyright 2016 Joyent, Inc. * Copyright 2016 Igor Kozhukhov */ /* * zfs diff support */ #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include "libzfs_impl.h" #define ZDIFF_SNAPDIR "/.zfs/snapshot/" #define ZDIFF_PREFIX "zfs-diff-%d" #define ZDIFF_ADDED '+' #define ZDIFF_MODIFIED "M" #define ZDIFF_REMOVED '-' #define ZDIFF_RENAMED "R" /* * Given a {dsname, object id}, get the object path */ static int get_stats_for_obj(differ_info_t *di, const char *dsname, uint64_t obj, char *pn, int maxlen, zfs_stat_t *sb) { zfs_cmd_t zc = {"\0"}; int error; (void) strlcpy(zc.zc_name, dsname, sizeof (zc.zc_name)); zc.zc_obj = obj; errno = 0; error = zfs_ioctl(di->zhp->zfs_hdl, ZFS_IOC_OBJ_TO_STATS, &zc); di->zerr = errno; /* we can get stats even if we failed to get a path */ (void) memcpy(sb, &zc.zc_stat, sizeof (zfs_stat_t)); if (error == 0) { ASSERT(di->zerr == 0); (void) strlcpy(pn, zc.zc_value, maxlen); return (0); } if (di->zerr == ESTALE) { (void) snprintf(pn, maxlen, "(on_delete_queue)"); return (0); } else if (di->zerr == EPERM) { (void) snprintf(di->errbuf, sizeof (di->errbuf), dgettext(TEXT_DOMAIN, "The sys_config privilege or diff delegated permission " "is needed\nto discover path names")); return (-1); } else if (di->zerr == EACCES) { (void) snprintf(di->errbuf, sizeof (di->errbuf), dgettext(TEXT_DOMAIN, "Key must be loaded to discover path names")); return (-1); } else { (void) snprintf(di->errbuf, sizeof (di->errbuf), dgettext(TEXT_DOMAIN, "Unable to determine path or stats for " "object %lld in %s"), (longlong_t)obj, dsname); return (-1); } } /* * stream_bytes * * Prints a file name out a character at a time. If the character is * not in the range of what we consider "printable" ASCII, display it * as an escaped 4-digit octal value. ASCII values less than a space * are all control characters and we declare the upper end as the * DELete character. This also is the last 7-bit ASCII character. * We choose to treat all 8-bit ASCII as not printable for this * application. */ static void stream_bytes(FILE *fp, const char *string) { char c; while ((c = *string++) != '\0') { if (c > ' ' && c != '\\' && c < '\177') { (void) fputc(c, fp); } else { (void) fprintf(fp, "\\%04hho", (uint8_t)c); } } } static char get_what(mode_t what) { switch (what & S_IFMT) { case S_IFBLK: return ('B'); case S_IFCHR: return ('C'); case S_IFDIR: return ('/'); #ifdef S_IFDOOR case S_IFDOOR: return ('>'); #endif case S_IFIFO: return ('|'); case S_IFLNK: return ('@'); #ifdef S_IFPORT case S_IFPORT: return ('P'); #endif case S_IFSOCK: return ('='); case S_IFREG: return ('F'); default: return ('?'); } } static void print_cmn(FILE *fp, differ_info_t *di, const char *file) { if (!di->no_mangle) { stream_bytes(fp, di->dsmnt); stream_bytes(fp, file); } else { (void) fputs(di->dsmnt, fp); (void) fputs(file, fp); } } static void print_rename(FILE *fp, differ_info_t *di, const char *old, const char *new, zfs_stat_t *isb) { if (di->timestamped) (void) fprintf(fp, "%10lld.%09lld\t", (longlong_t)isb->zs_ctime[0], (longlong_t)isb->zs_ctime[1]); (void) fputs(ZDIFF_RENAMED "\t", fp); if (di->classify) (void) fprintf(fp, "%c\t", get_what(isb->zs_mode)); print_cmn(fp, di, old); (void) fputs(di->scripted ? "\t" : " -> ", fp); print_cmn(fp, di, new); (void) fputc('\n', fp); } static void print_link_change(FILE *fp, differ_info_t *di, int delta, const char *file, zfs_stat_t *isb) { if (di->timestamped) (void) fprintf(fp, "%10lld.%09lld\t", (longlong_t)isb->zs_ctime[0], (longlong_t)isb->zs_ctime[1]); (void) fputs(ZDIFF_MODIFIED "\t", fp); if (di->classify) (void) fprintf(fp, "%c\t", get_what(isb->zs_mode)); print_cmn(fp, di, file); (void) fprintf(fp, "\t(%+d)\n", delta); } static void print_file(FILE *fp, differ_info_t *di, char type, const char *file, zfs_stat_t *isb) { if (di->timestamped) (void) fprintf(fp, "%10lld.%09lld\t", (longlong_t)isb->zs_ctime[0], (longlong_t)isb->zs_ctime[1]); (void) fprintf(fp, "%c\t", type); if (di->classify) (void) fprintf(fp, "%c\t", get_what(isb->zs_mode)); print_cmn(fp, di, file); (void) fputc('\n', fp); } static int write_inuse_diffs_one(FILE *fp, differ_info_t *di, uint64_t dobj) { struct zfs_stat fsb, tsb; mode_t fmode, tmode; char fobjname[MAXPATHLEN], tobjname[MAXPATHLEN]; boolean_t already_logged = B_FALSE; int fobjerr, tobjerr; int change; if (dobj == di->shares) return (0); /* * Check the from and to snapshots for info on the object. If * we get ENOENT, then the object just didn't exist in that * snapshot. If we get ENOTSUP, then we tried to get * info on a non-ZPL object, which we don't care about anyway. * For any other error we print a warning which includes the * errno and continue. */ fobjerr = get_stats_for_obj(di, di->fromsnap, dobj, fobjname, MAXPATHLEN, &fsb); if (fobjerr && di->zerr != ENOTSUP && di->zerr != ENOENT) { zfs_error_aux(di->zhp->zfs_hdl, "%s", strerror(di->zerr)); zfs_error(di->zhp->zfs_hdl, di->zerr, di->errbuf); /* * Let's not print an error for the same object more than * once if it happens in both snapshots */ already_logged = B_TRUE; } tobjerr = get_stats_for_obj(di, di->tosnap, dobj, tobjname, MAXPATHLEN, &tsb); if (tobjerr && di->zerr != ENOTSUP && di->zerr != ENOENT) { if (!already_logged) { zfs_error_aux(di->zhp->zfs_hdl, "%s", strerror(di->zerr)); zfs_error(di->zhp->zfs_hdl, di->zerr, di->errbuf); } } /* * Unallocated object sharing the same meta dnode block */ if (fobjerr && tobjerr) { di->zerr = 0; return (0); } di->zerr = 0; /* negate get_stats_for_obj() from side that failed */ fmode = fsb.zs_mode & S_IFMT; tmode = tsb.zs_mode & S_IFMT; if (fmode == S_IFDIR || tmode == S_IFDIR || fsb.zs_links == 0 || tsb.zs_links == 0) change = 0; else change = tsb.zs_links - fsb.zs_links; if (fobjerr) { if (change) { print_link_change(fp, di, change, tobjname, &tsb); return (0); } print_file(fp, di, ZDIFF_ADDED, tobjname, &tsb); return (0); } else if (tobjerr) { if (change) { print_link_change(fp, di, change, fobjname, &fsb); return (0); } print_file(fp, di, ZDIFF_REMOVED, fobjname, &fsb); return (0); } if (fmode != tmode && fsb.zs_gen == tsb.zs_gen) tsb.zs_gen++; /* Force a generational difference */ /* Simple modification or no change */ if (fsb.zs_gen == tsb.zs_gen) { /* No apparent changes. Could we assert !this? */ if (fsb.zs_ctime[0] == tsb.zs_ctime[0] && fsb.zs_ctime[1] == tsb.zs_ctime[1]) return (0); if (change) { print_link_change(fp, di, change, change > 0 ? fobjname : tobjname, &tsb); } else if (strcmp(fobjname, tobjname) == 0) { print_file(fp, di, *ZDIFF_MODIFIED, fobjname, &tsb); } else { print_rename(fp, di, fobjname, tobjname, &tsb); } return (0); } else { /* file re-created or object re-used */ print_file(fp, di, ZDIFF_REMOVED, fobjname, &fsb); print_file(fp, di, ZDIFF_ADDED, tobjname, &tsb); return (0); } } static int write_inuse_diffs(FILE *fp, differ_info_t *di, dmu_diff_record_t *dr) { uint64_t o; int err; for (o = dr->ddr_first; o <= dr->ddr_last; o++) { if ((err = write_inuse_diffs_one(fp, di, o)) != 0) return (err); } return (0); } static int describe_free(FILE *fp, differ_info_t *di, uint64_t object, char *namebuf, int maxlen) { struct zfs_stat sb; (void) get_stats_for_obj(di, di->fromsnap, object, namebuf, maxlen, &sb); /* Don't print if in the delete queue on from side */ if (di->zerr == ESTALE || di->zerr == ENOENT) { di->zerr = 0; return (0); } print_file(fp, di, ZDIFF_REMOVED, namebuf, &sb); return (0); } static int write_free_diffs(FILE *fp, differ_info_t *di, dmu_diff_record_t *dr) { zfs_cmd_t zc = {"\0"}; libzfs_handle_t *lhdl = di->zhp->zfs_hdl; char fobjname[MAXPATHLEN]; (void) strlcpy(zc.zc_name, di->fromsnap, sizeof (zc.zc_name)); zc.zc_obj = dr->ddr_first - 1; ASSERT(di->zerr == 0); while (zc.zc_obj < dr->ddr_last) { int err; err = zfs_ioctl(lhdl, ZFS_IOC_NEXT_OBJ, &zc); if (err == 0) { if (zc.zc_obj == di->shares) { zc.zc_obj++; continue; } if (zc.zc_obj > dr->ddr_last) { break; } - err = describe_free(fp, di, zc.zc_obj, fobjname, + (void) describe_free(fp, di, zc.zc_obj, fobjname, MAXPATHLEN); } else if (errno == ESRCH) { break; } else { (void) snprintf(di->errbuf, sizeof (di->errbuf), dgettext(TEXT_DOMAIN, "next allocated object (> %lld) find failure"), (longlong_t)zc.zc_obj); di->zerr = errno; break; } } if (di->zerr) return (-1); return (0); } static void * differ(void *arg) { differ_info_t *di = arg; dmu_diff_record_t dr; FILE *ofp; int err = 0; if ((ofp = fdopen(di->outputfd, "w")) == NULL) { di->zerr = errno; strlcpy(di->errbuf, strerror(errno), sizeof (di->errbuf)); (void) close(di->datafd); return ((void *)-1); } for (;;) { char *cp = (char *)&dr; int len = sizeof (dr); int rv; do { rv = read(di->datafd, cp, len); cp += rv; len -= rv; } while (len > 0 && rv > 0); if (rv < 0 || (rv == 0 && len != sizeof (dr))) { di->zerr = EPIPE; break; } else if (rv == 0) { /* end of file at a natural breaking point */ break; } switch (dr.ddr_type) { case DDR_FREE: err = write_free_diffs(ofp, di, &dr); break; case DDR_INUSE: err = write_inuse_diffs(ofp, di, &dr); break; default: di->zerr = EPIPE; break; } if (err || di->zerr) break; } (void) fclose(ofp); (void) close(di->datafd); if (err) return ((void *)-1); if (di->zerr) { ASSERT(di->zerr == EPIPE); (void) snprintf(di->errbuf, sizeof (di->errbuf), dgettext(TEXT_DOMAIN, "Internal error: bad data from diff IOCTL")); return ((void *)-1); } return ((void *)0); } static int make_temp_snapshot(differ_info_t *di) { libzfs_handle_t *hdl = di->zhp->zfs_hdl; zfs_cmd_t zc = {"\0"}; (void) snprintf(zc.zc_value, sizeof (zc.zc_value), ZDIFF_PREFIX, getpid()); (void) strlcpy(zc.zc_name, di->ds, sizeof (zc.zc_name)); zc.zc_cleanup_fd = di->cleanupfd; if (zfs_ioctl(hdl, ZFS_IOC_TMP_SNAPSHOT, &zc) != 0) { int err = errno; if (err == EPERM) { (void) snprintf(di->errbuf, sizeof (di->errbuf), dgettext(TEXT_DOMAIN, "The diff delegated " "permission is needed in order\nto create a " "just-in-time snapshot for diffing\n")); return (zfs_error(hdl, EZFS_DIFF, di->errbuf)); } else { (void) snprintf(di->errbuf, sizeof (di->errbuf), dgettext(TEXT_DOMAIN, "Cannot create just-in-time " "snapshot of '%s'"), zc.zc_name); return (zfs_standard_error(hdl, err, di->errbuf)); } } di->tmpsnap = zfs_strdup(hdl, zc.zc_value); di->tosnap = zfs_asprintf(hdl, "%s@%s", di->ds, di->tmpsnap); return (0); } static void teardown_differ_info(differ_info_t *di) { free(di->ds); free(di->dsmnt); free(di->fromsnap); free(di->frommnt); free(di->tosnap); free(di->tmpsnap); free(di->tomnt); (void) close(di->cleanupfd); } static int get_snapshot_names(differ_info_t *di, const char *fromsnap, const char *tosnap) { libzfs_handle_t *hdl = di->zhp->zfs_hdl; char *atptrf = NULL; char *atptrt = NULL; int fdslen, fsnlen; int tdslen, tsnlen; /* * Can accept * fdslen fsnlen tdslen tsnlen * dataset@snap1 * 0. dataset@snap1 dataset@snap2 >0 >1 >0 >1 * 1. dataset@snap1 @snap2 >0 >1 ==0 >1 * 2. dataset@snap1 dataset >0 >1 >0 ==0 * 3. @snap1 dataset@snap2 ==0 >1 >0 >1 * 4. @snap1 dataset ==0 >1 >0 ==0 */ if (tosnap == NULL) { /* only a from snapshot given, must be valid */ (void) snprintf(di->errbuf, sizeof (di->errbuf), dgettext(TEXT_DOMAIN, "Badly formed snapshot name %s"), fromsnap); if (!zfs_validate_name(hdl, fromsnap, ZFS_TYPE_SNAPSHOT, B_FALSE)) { return (zfs_error(hdl, EZFS_INVALIDNAME, di->errbuf)); } atptrf = strchr(fromsnap, '@'); ASSERT(atptrf != NULL); fdslen = atptrf - fromsnap; di->fromsnap = zfs_strdup(hdl, fromsnap); di->ds = zfs_strdup(hdl, fromsnap); di->ds[fdslen] = '\0'; /* the to snap will be a just-in-time snap of the head */ return (make_temp_snapshot(di)); } (void) snprintf(di->errbuf, sizeof (di->errbuf), dgettext(TEXT_DOMAIN, "Unable to determine which snapshots to compare")); atptrf = strchr(fromsnap, '@'); atptrt = strchr(tosnap, '@'); fdslen = atptrf ? atptrf - fromsnap : strlen(fromsnap); tdslen = atptrt ? atptrt - tosnap : strlen(tosnap); fsnlen = strlen(fromsnap) - fdslen; /* includes @ sign */ tsnlen = strlen(tosnap) - tdslen; /* includes @ sign */ if (fsnlen <= 1 || tsnlen == 1 || (fdslen == 0 && tdslen == 0)) { return (zfs_error(hdl, EZFS_INVALIDNAME, di->errbuf)); } else if ((fdslen > 0 && tdslen > 0) && ((tdslen != fdslen || strncmp(fromsnap, tosnap, fdslen) != 0))) { /* * not the same dataset name, might be okay if * tosnap is a clone of a fromsnap descendant. */ char origin[ZFS_MAX_DATASET_NAME_LEN]; zprop_source_t src; zfs_handle_t *zhp; di->ds = zfs_alloc(di->zhp->zfs_hdl, tdslen + 1); (void) strlcpy(di->ds, tosnap, tdslen + 1); zhp = zfs_open(hdl, di->ds, ZFS_TYPE_FILESYSTEM); while (zhp != NULL) { if (zfs_prop_get(zhp, ZFS_PROP_ORIGIN, origin, sizeof (origin), &src, NULL, 0, B_FALSE) != 0) { (void) zfs_close(zhp); zhp = NULL; break; } if (strncmp(origin, fromsnap, fsnlen) == 0) break; (void) zfs_close(zhp); zhp = zfs_open(hdl, origin, ZFS_TYPE_FILESYSTEM); } if (zhp == NULL) { (void) snprintf(di->errbuf, sizeof (di->errbuf), dgettext(TEXT_DOMAIN, "Not an earlier snapshot from the same fs")); return (zfs_error(hdl, EZFS_INVALIDNAME, di->errbuf)); } else { (void) zfs_close(zhp); } di->isclone = B_TRUE; di->fromsnap = zfs_strdup(hdl, fromsnap); if (tsnlen) di->tosnap = zfs_strdup(hdl, tosnap); else return (make_temp_snapshot(di)); } else { int dslen = fdslen ? fdslen : tdslen; di->ds = zfs_alloc(hdl, dslen + 1); (void) strlcpy(di->ds, fdslen ? fromsnap : tosnap, dslen + 1); di->fromsnap = zfs_asprintf(hdl, "%s%s", di->ds, atptrf); if (tsnlen) { di->tosnap = zfs_asprintf(hdl, "%s%s", di->ds, atptrt); } else { return (make_temp_snapshot(di)); } } return (0); } static int get_mountpoint(differ_info_t *di, char *dsnm, char **mntpt) { boolean_t mounted; mounted = is_mounted(di->zhp->zfs_hdl, dsnm, mntpt); if (mounted == B_FALSE) { (void) snprintf(di->errbuf, sizeof (di->errbuf), dgettext(TEXT_DOMAIN, "Cannot diff an unmounted snapshot")); return (zfs_error(di->zhp->zfs_hdl, EZFS_BADTYPE, di->errbuf)); } /* Avoid a double slash at the beginning of root-mounted datasets */ if (**mntpt == '/' && *(*mntpt + 1) == '\0') **mntpt = '\0'; return (0); } static int get_mountpoints(differ_info_t *di) { char *strptr; char *frommntpt; /* * first get the mountpoint for the parent dataset */ if (get_mountpoint(di, di->ds, &di->dsmnt) != 0) return (-1); strptr = strchr(di->tosnap, '@'); ASSERT3P(strptr, !=, NULL); di->tomnt = zfs_asprintf(di->zhp->zfs_hdl, "%s%s%s", di->dsmnt, ZDIFF_SNAPDIR, ++strptr); strptr = strchr(di->fromsnap, '@'); ASSERT3P(strptr, !=, NULL); frommntpt = di->dsmnt; if (di->isclone) { char *mntpt; int err; *strptr = '\0'; err = get_mountpoint(di, di->fromsnap, &mntpt); *strptr = '@'; if (err != 0) return (-1); frommntpt = mntpt; } di->frommnt = zfs_asprintf(di->zhp->zfs_hdl, "%s%s%s", frommntpt, ZDIFF_SNAPDIR, ++strptr); if (di->isclone) free(frommntpt); return (0); } static int setup_differ_info(zfs_handle_t *zhp, const char *fromsnap, const char *tosnap, differ_info_t *di) { di->zhp = zhp; di->cleanupfd = open(ZFS_DEV, O_RDWR | O_CLOEXEC); VERIFY(di->cleanupfd >= 0); if (get_snapshot_names(di, fromsnap, tosnap) != 0) return (-1); if (get_mountpoints(di) != 0) return (-1); if (find_shares_object(di) != 0) return (-1); return (0); } int zfs_show_diffs(zfs_handle_t *zhp, int outfd, const char *fromsnap, const char *tosnap, int flags) { zfs_cmd_t zc = {"\0"}; char errbuf[ERRBUFLEN]; differ_info_t di = { 0 }; pthread_t tid; int pipefd[2]; int iocerr; (void) snprintf(errbuf, sizeof (errbuf), dgettext(TEXT_DOMAIN, "zfs diff failed")); if (setup_differ_info(zhp, fromsnap, tosnap, &di)) { teardown_differ_info(&di); return (-1); } if (pipe2(pipefd, O_CLOEXEC)) { zfs_error_aux(zhp->zfs_hdl, "%s", strerror(errno)); teardown_differ_info(&di); return (zfs_error(zhp->zfs_hdl, EZFS_PIPEFAILED, errbuf)); } di.scripted = (flags & ZFS_DIFF_PARSEABLE); di.classify = (flags & ZFS_DIFF_CLASSIFY); di.timestamped = (flags & ZFS_DIFF_TIMESTAMP); di.no_mangle = (flags & ZFS_DIFF_NO_MANGLE); di.outputfd = outfd; di.datafd = pipefd[0]; if (pthread_create(&tid, NULL, differ, &di)) { zfs_error_aux(zhp->zfs_hdl, "%s", strerror(errno)); (void) close(pipefd[0]); (void) close(pipefd[1]); teardown_differ_info(&di); return (zfs_error(zhp->zfs_hdl, EZFS_THREADCREATEFAILED, errbuf)); } /* do the ioctl() */ (void) strlcpy(zc.zc_value, di.fromsnap, strlen(di.fromsnap) + 1); (void) strlcpy(zc.zc_name, di.tosnap, strlen(di.tosnap) + 1); zc.zc_cookie = pipefd[1]; iocerr = zfs_ioctl(zhp->zfs_hdl, ZFS_IOC_DIFF, &zc); if (iocerr != 0) { (void) snprintf(errbuf, sizeof (errbuf), dgettext(TEXT_DOMAIN, "Unable to obtain diffs")); if (errno == EPERM) { zfs_error_aux(zhp->zfs_hdl, dgettext(TEXT_DOMAIN, "\n The sys_mount privilege or diff delegated " "permission is needed\n to execute the " "diff ioctl")); } else if (errno == EXDEV) { zfs_error_aux(zhp->zfs_hdl, dgettext(TEXT_DOMAIN, "\n Not an earlier snapshot from the same fs")); } else if (errno != EPIPE || di.zerr == 0) { zfs_error_aux(zhp->zfs_hdl, "%s", strerror(errno)); } (void) close(pipefd[1]); (void) pthread_cancel(tid); (void) pthread_join(tid, NULL); teardown_differ_info(&di); if (di.zerr != 0 && di.zerr != EPIPE) { zfs_error_aux(zhp->zfs_hdl, "%s", strerror(di.zerr)); return (zfs_error(zhp->zfs_hdl, EZFS_DIFF, di.errbuf)); } else { return (zfs_error(zhp->zfs_hdl, EZFS_DIFFDATA, errbuf)); } } (void) close(pipefd[1]); (void) pthread_join(tid, NULL); if (di.zerr != 0) { zfs_error_aux(zhp->zfs_hdl, "%s", strerror(di.zerr)); return (zfs_error(zhp->zfs_hdl, EZFS_DIFF, di.errbuf)); } teardown_differ_info(&di); return (0); } diff --git a/lib/libzfs/libzfs_pool.c b/lib/libzfs/libzfs_pool.c index b9806dc30dac..c6f31d785b89 100644 --- a/lib/libzfs/libzfs_pool.c +++ b/lib/libzfs/libzfs_pool.c @@ -1,5218 +1,5217 @@ /* * CDDL HEADER START * * The contents of this file are subject to the terms of the * Common Development and Distribution License (the "License"). * You may not use this file except in compliance with the License. * * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE * or https://opensource.org/licenses/CDDL-1.0. * See the License for the specific language governing permissions * and limitations under the License. * * When distributing Covered Code, include this CDDL HEADER in each * file and include the License file at usr/src/OPENSOLARIS.LICENSE. * If applicable, add the following below this CDDL HEADER, with the * fields enclosed by brackets "[]" replaced with your own identifying * information: Portions Copyright [yyyy] [name of copyright owner] * * CDDL HEADER END */ /* * Copyright 2015 Nexenta Systems, Inc. All rights reserved. * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. * Copyright (c) 2011, 2020 by Delphix. All rights reserved. * Copyright 2016 Igor Kozhukhov * Copyright (c) 2018 Datto Inc. * Copyright (c) 2017 Open-E, Inc. All Rights Reserved. * Copyright (c) 2017, Intel Corporation. * Copyright (c) 2018, loli10K * Copyright (c) 2021, Colm Buckley * Copyright (c) 2021, Klara Inc. */ #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include "zfs_namecheck.h" #include "zfs_prop.h" #include "libzfs_impl.h" #include "zfs_comutil.h" #include "zfeature_common.h" static boolean_t zpool_vdev_is_interior(const char *name); typedef struct prop_flags { int create:1; /* Validate property on creation */ int import:1; /* Validate property on import */ int vdevprop:1; /* Validate property as a VDEV property */ } prop_flags_t; /* * ==================================================================== * zpool property functions * ==================================================================== */ static int zpool_get_all_props(zpool_handle_t *zhp) { zfs_cmd_t zc = {"\0"}; libzfs_handle_t *hdl = zhp->zpool_hdl; (void) strlcpy(zc.zc_name, zhp->zpool_name, sizeof (zc.zc_name)); zcmd_alloc_dst_nvlist(hdl, &zc, 0); while (zfs_ioctl(hdl, ZFS_IOC_POOL_GET_PROPS, &zc) != 0) { if (errno == ENOMEM) zcmd_expand_dst_nvlist(hdl, &zc); else { zcmd_free_nvlists(&zc); return (-1); } } if (zcmd_read_dst_nvlist(hdl, &zc, &zhp->zpool_props) != 0) { zcmd_free_nvlists(&zc); return (-1); } zcmd_free_nvlists(&zc); return (0); } int zpool_props_refresh(zpool_handle_t *zhp) { nvlist_t *old_props; old_props = zhp->zpool_props; if (zpool_get_all_props(zhp) != 0) return (-1); nvlist_free(old_props); return (0); } static const char * zpool_get_prop_string(zpool_handle_t *zhp, zpool_prop_t prop, zprop_source_t *src) { nvlist_t *nv, *nvl; const char *value; zprop_source_t source; nvl = zhp->zpool_props; if (nvlist_lookup_nvlist(nvl, zpool_prop_to_name(prop), &nv) == 0) { source = fnvlist_lookup_uint64(nv, ZPROP_SOURCE); value = fnvlist_lookup_string(nv, ZPROP_VALUE); } else { source = ZPROP_SRC_DEFAULT; if ((value = zpool_prop_default_string(prop)) == NULL) value = "-"; } if (src) *src = source; return (value); } uint64_t zpool_get_prop_int(zpool_handle_t *zhp, zpool_prop_t prop, zprop_source_t *src) { nvlist_t *nv, *nvl; uint64_t value; zprop_source_t source; if (zhp->zpool_props == NULL && zpool_get_all_props(zhp)) { /* * zpool_get_all_props() has most likely failed because * the pool is faulted, but if all we need is the top level * vdev's guid then get it from the zhp config nvlist. */ if ((prop == ZPOOL_PROP_GUID) && (nvlist_lookup_nvlist(zhp->zpool_config, ZPOOL_CONFIG_VDEV_TREE, &nv) == 0) && (nvlist_lookup_uint64(nv, ZPOOL_CONFIG_GUID, &value) == 0)) { return (value); } return (zpool_prop_default_numeric(prop)); } nvl = zhp->zpool_props; if (nvlist_lookup_nvlist(nvl, zpool_prop_to_name(prop), &nv) == 0) { source = fnvlist_lookup_uint64(nv, ZPROP_SOURCE); value = fnvlist_lookup_uint64(nv, ZPROP_VALUE); } else { source = ZPROP_SRC_DEFAULT; value = zpool_prop_default_numeric(prop); } if (src) *src = source; return (value); } /* * Map VDEV STATE to printed strings. */ const char * zpool_state_to_name(vdev_state_t state, vdev_aux_t aux) { switch (state) { case VDEV_STATE_CLOSED: case VDEV_STATE_OFFLINE: return (gettext("OFFLINE")); case VDEV_STATE_REMOVED: return (gettext("REMOVED")); case VDEV_STATE_CANT_OPEN: if (aux == VDEV_AUX_CORRUPT_DATA || aux == VDEV_AUX_BAD_LOG) return (gettext("FAULTED")); else if (aux == VDEV_AUX_SPLIT_POOL) return (gettext("SPLIT")); else return (gettext("UNAVAIL")); case VDEV_STATE_FAULTED: return (gettext("FAULTED")); case VDEV_STATE_DEGRADED: return (gettext("DEGRADED")); case VDEV_STATE_HEALTHY: return (gettext("ONLINE")); default: break; } return (gettext("UNKNOWN")); } /* * Map POOL STATE to printed strings. */ const char * zpool_pool_state_to_name(pool_state_t state) { switch (state) { default: break; case POOL_STATE_ACTIVE: return (gettext("ACTIVE")); case POOL_STATE_EXPORTED: return (gettext("EXPORTED")); case POOL_STATE_DESTROYED: return (gettext("DESTROYED")); case POOL_STATE_SPARE: return (gettext("SPARE")); case POOL_STATE_L2CACHE: return (gettext("L2CACHE")); case POOL_STATE_UNINITIALIZED: return (gettext("UNINITIALIZED")); case POOL_STATE_UNAVAIL: return (gettext("UNAVAIL")); case POOL_STATE_POTENTIALLY_ACTIVE: return (gettext("POTENTIALLY_ACTIVE")); } return (gettext("UNKNOWN")); } /* * Given a pool handle, return the pool health string ("ONLINE", "DEGRADED", * "SUSPENDED", etc). */ const char * zpool_get_state_str(zpool_handle_t *zhp) { zpool_errata_t errata; zpool_status_t status; const char *str; status = zpool_get_status(zhp, NULL, &errata); if (zpool_get_state(zhp) == POOL_STATE_UNAVAIL) { str = gettext("FAULTED"); } else if (status == ZPOOL_STATUS_IO_FAILURE_WAIT || status == ZPOOL_STATUS_IO_FAILURE_MMP) { str = gettext("SUSPENDED"); } else { nvlist_t *nvroot = fnvlist_lookup_nvlist( zpool_get_config(zhp, NULL), ZPOOL_CONFIG_VDEV_TREE); uint_t vsc; vdev_stat_t *vs = (vdev_stat_t *)fnvlist_lookup_uint64_array( nvroot, ZPOOL_CONFIG_VDEV_STATS, &vsc); str = zpool_state_to_name(vs->vs_state, vs->vs_aux); } return (str); } /* * Get a zpool property value for 'prop' and return the value in * a pre-allocated buffer. */ int zpool_get_prop(zpool_handle_t *zhp, zpool_prop_t prop, char *buf, size_t len, zprop_source_t *srctype, boolean_t literal) { uint64_t intval; const char *strval; zprop_source_t src = ZPROP_SRC_NONE; if (zpool_get_state(zhp) == POOL_STATE_UNAVAIL) { switch (prop) { case ZPOOL_PROP_NAME: (void) strlcpy(buf, zpool_get_name(zhp), len); break; case ZPOOL_PROP_HEALTH: (void) strlcpy(buf, zpool_get_state_str(zhp), len); break; case ZPOOL_PROP_GUID: intval = zpool_get_prop_int(zhp, prop, &src); (void) snprintf(buf, len, "%llu", (u_longlong_t)intval); break; case ZPOOL_PROP_ALTROOT: case ZPOOL_PROP_CACHEFILE: case ZPOOL_PROP_COMMENT: case ZPOOL_PROP_COMPATIBILITY: if (zhp->zpool_props != NULL || zpool_get_all_props(zhp) == 0) { (void) strlcpy(buf, zpool_get_prop_string(zhp, prop, &src), len); break; } zfs_fallthrough; default: (void) strlcpy(buf, "-", len); break; } if (srctype != NULL) *srctype = src; return (0); } if (zhp->zpool_props == NULL && zpool_get_all_props(zhp) && prop != ZPOOL_PROP_NAME) return (-1); switch (zpool_prop_get_type(prop)) { case PROP_TYPE_STRING: (void) strlcpy(buf, zpool_get_prop_string(zhp, prop, &src), len); break; case PROP_TYPE_NUMBER: intval = zpool_get_prop_int(zhp, prop, &src); switch (prop) { case ZPOOL_PROP_SIZE: case ZPOOL_PROP_ALLOCATED: case ZPOOL_PROP_FREE: case ZPOOL_PROP_FREEING: case ZPOOL_PROP_LEAKED: case ZPOOL_PROP_ASHIFT: case ZPOOL_PROP_MAXBLOCKSIZE: case ZPOOL_PROP_MAXDNODESIZE: if (literal) (void) snprintf(buf, len, "%llu", (u_longlong_t)intval); else (void) zfs_nicenum(intval, buf, len); break; case ZPOOL_PROP_EXPANDSZ: case ZPOOL_PROP_CHECKPOINT: if (intval == 0) { (void) strlcpy(buf, "-", len); } else if (literal) { (void) snprintf(buf, len, "%llu", (u_longlong_t)intval); } else { (void) zfs_nicebytes(intval, buf, len); } break; case ZPOOL_PROP_CAPACITY: if (literal) { (void) snprintf(buf, len, "%llu", (u_longlong_t)intval); } else { (void) snprintf(buf, len, "%llu%%", (u_longlong_t)intval); } break; case ZPOOL_PROP_FRAGMENTATION: if (intval == UINT64_MAX) { (void) strlcpy(buf, "-", len); } else if (literal) { (void) snprintf(buf, len, "%llu", (u_longlong_t)intval); } else { (void) snprintf(buf, len, "%llu%%", (u_longlong_t)intval); } break; case ZPOOL_PROP_DEDUPRATIO: if (literal) (void) snprintf(buf, len, "%llu.%02llu", (u_longlong_t)(intval / 100), (u_longlong_t)(intval % 100)); else (void) snprintf(buf, len, "%llu.%02llux", (u_longlong_t)(intval / 100), (u_longlong_t)(intval % 100)); break; case ZPOOL_PROP_HEALTH: (void) strlcpy(buf, zpool_get_state_str(zhp), len); break; case ZPOOL_PROP_VERSION: if (intval >= SPA_VERSION_FEATURES) { (void) snprintf(buf, len, "-"); break; } zfs_fallthrough; default: (void) snprintf(buf, len, "%llu", (u_longlong_t)intval); } break; case PROP_TYPE_INDEX: intval = zpool_get_prop_int(zhp, prop, &src); if (zpool_prop_index_to_string(prop, intval, &strval) != 0) return (-1); (void) strlcpy(buf, strval, len); break; default: abort(); } if (srctype) *srctype = src; return (0); } /* * Check if the bootfs name has the same pool name as it is set to. * Assuming bootfs is a valid dataset name. */ static boolean_t bootfs_name_valid(const char *pool, const char *bootfs) { int len = strlen(pool); if (bootfs[0] == '\0') return (B_TRUE); if (!zfs_name_valid(bootfs, ZFS_TYPE_FILESYSTEM|ZFS_TYPE_SNAPSHOT)) return (B_FALSE); if (strncmp(pool, bootfs, len) == 0 && (bootfs[len] == '/' || bootfs[len] == '\0')) return (B_TRUE); return (B_FALSE); } /* * Given an nvlist of zpool properties to be set, validate that they are * correct, and parse any numeric properties (index, boolean, etc) if they are * specified as strings. */ static nvlist_t * zpool_valid_proplist(libzfs_handle_t *hdl, const char *poolname, nvlist_t *props, uint64_t version, prop_flags_t flags, char *errbuf) { nvpair_t *elem; nvlist_t *retprops; zpool_prop_t prop; char *strval; uint64_t intval; char *slash, *check; struct stat64 statbuf; zpool_handle_t *zhp; char report[1024]; if (nvlist_alloc(&retprops, NV_UNIQUE_NAME, 0) != 0) { (void) no_memory(hdl); return (NULL); } elem = NULL; while ((elem = nvlist_next_nvpair(props, elem)) != NULL) { const char *propname = nvpair_name(elem); if (flags.vdevprop && zpool_prop_vdev(propname)) { vdev_prop_t vprop = vdev_name_to_prop(propname); if (vdev_prop_readonly(vprop)) { zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "'%s' " "is readonly"), propname); (void) zfs_error(hdl, EZFS_PROPREADONLY, errbuf); goto error; } if (zprop_parse_value(hdl, elem, vprop, ZFS_TYPE_VDEV, retprops, &strval, &intval, errbuf) != 0) goto error; continue; } else if (flags.vdevprop && vdev_prop_user(propname)) { if (nvlist_add_nvpair(retprops, elem) != 0) { (void) no_memory(hdl); goto error; } continue; } else if (flags.vdevprop) { zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "invalid property: '%s'"), propname); (void) zfs_error(hdl, EZFS_BADPROP, errbuf); goto error; } prop = zpool_name_to_prop(propname); if (prop == ZPOOL_PROP_INVAL && zpool_prop_feature(propname)) { int err; char *fname = strchr(propname, '@') + 1; err = zfeature_lookup_name(fname, NULL); if (err != 0) { ASSERT3U(err, ==, ENOENT); zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "feature '%s' unsupported by kernel"), fname); (void) zfs_error(hdl, EZFS_BADPROP, errbuf); goto error; } if (nvpair_type(elem) != DATA_TYPE_STRING) { zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "'%s' must be a string"), propname); (void) zfs_error(hdl, EZFS_BADPROP, errbuf); goto error; } (void) nvpair_value_string(elem, &strval); if (strcmp(strval, ZFS_FEATURE_ENABLED) != 0 && strcmp(strval, ZFS_FEATURE_DISABLED) != 0) { zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "property '%s' can only be set to " "'enabled' or 'disabled'"), propname); (void) zfs_error(hdl, EZFS_BADPROP, errbuf); goto error; } if (!flags.create && strcmp(strval, ZFS_FEATURE_DISABLED) == 0) { zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "property '%s' can only be set to " "'disabled' at creation time"), propname); (void) zfs_error(hdl, EZFS_BADPROP, errbuf); goto error; } if (nvlist_add_uint64(retprops, propname, 0) != 0) { (void) no_memory(hdl); goto error; } continue; } /* * Make sure this property is valid and applies to this type. */ if (prop == ZPOOL_PROP_INVAL) { zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "invalid property '%s'"), propname); (void) zfs_error(hdl, EZFS_BADPROP, errbuf); goto error; } if (zpool_prop_readonly(prop)) { zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "'%s' " "is readonly"), propname); (void) zfs_error(hdl, EZFS_PROPREADONLY, errbuf); goto error; } if (!flags.create && zpool_prop_setonce(prop)) { zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "property '%s' can only be set at " "creation time"), propname); (void) zfs_error(hdl, EZFS_BADPROP, errbuf); goto error; } if (zprop_parse_value(hdl, elem, prop, ZFS_TYPE_POOL, retprops, &strval, &intval, errbuf) != 0) goto error; /* * Perform additional checking for specific properties. */ switch (prop) { case ZPOOL_PROP_VERSION: if (intval < version || !SPA_VERSION_IS_SUPPORTED(intval)) { zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "property '%s' number %llu is invalid."), propname, (unsigned long long)intval); (void) zfs_error(hdl, EZFS_BADVERSION, errbuf); goto error; } break; case ZPOOL_PROP_ASHIFT: if (intval != 0 && (intval < ASHIFT_MIN || intval > ASHIFT_MAX)) { zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "property '%s' number %llu is invalid, " "only values between %" PRId32 " and %" PRId32 " are allowed."), propname, (unsigned long long)intval, ASHIFT_MIN, ASHIFT_MAX); (void) zfs_error(hdl, EZFS_BADPROP, errbuf); goto error; } break; case ZPOOL_PROP_BOOTFS: if (flags.create || flags.import) { zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "property '%s' cannot be set at creation " "or import time"), propname); (void) zfs_error(hdl, EZFS_BADPROP, errbuf); goto error; } if (version < SPA_VERSION_BOOTFS) { zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "pool must be upgraded to support " "'%s' property"), propname); (void) zfs_error(hdl, EZFS_BADVERSION, errbuf); goto error; } /* * bootfs property value has to be a dataset name and * the dataset has to be in the same pool as it sets to. */ if (!bootfs_name_valid(poolname, strval)) { zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "'%s' " "is an invalid name"), strval); (void) zfs_error(hdl, EZFS_INVALIDNAME, errbuf); goto error; } if ((zhp = zpool_open_canfail(hdl, poolname)) == NULL) { zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "could not open pool '%s'"), poolname); (void) zfs_error(hdl, EZFS_OPENFAILED, errbuf); goto error; } zpool_close(zhp); break; case ZPOOL_PROP_ALTROOT: if (!flags.create && !flags.import) { zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "property '%s' can only be set during pool " "creation or import"), propname); (void) zfs_error(hdl, EZFS_BADPROP, errbuf); goto error; } if (strval[0] != '/') { zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "bad alternate root '%s'"), strval); (void) zfs_error(hdl, EZFS_BADPATH, errbuf); goto error; } break; case ZPOOL_PROP_CACHEFILE: if (strval[0] == '\0') break; if (strcmp(strval, "none") == 0) break; if (strval[0] != '/') { zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "property '%s' must be empty, an " "absolute path, or 'none'"), propname); (void) zfs_error(hdl, EZFS_BADPATH, errbuf); goto error; } slash = strrchr(strval, '/'); if (slash[1] == '\0' || strcmp(slash, "/.") == 0 || strcmp(slash, "/..") == 0) { zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "'%s' is not a valid file"), strval); (void) zfs_error(hdl, EZFS_BADPATH, errbuf); goto error; } *slash = '\0'; if (strval[0] != '\0' && (stat64(strval, &statbuf) != 0 || !S_ISDIR(statbuf.st_mode))) { zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "'%s' is not a valid directory"), strval); (void) zfs_error(hdl, EZFS_BADPATH, errbuf); goto error; } *slash = '/'; break; case ZPOOL_PROP_COMPATIBILITY: switch (zpool_load_compat(strval, NULL, report, 1024)) { case ZPOOL_COMPATIBILITY_OK: case ZPOOL_COMPATIBILITY_WARNTOKEN: break; case ZPOOL_COMPATIBILITY_BADFILE: case ZPOOL_COMPATIBILITY_BADTOKEN: case ZPOOL_COMPATIBILITY_NOFILES: zfs_error_aux(hdl, "%s", report); (void) zfs_error(hdl, EZFS_BADPROP, errbuf); goto error; } break; case ZPOOL_PROP_COMMENT: for (check = strval; *check != '\0'; check++) { if (!isprint(*check)) { zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "comment may only have printable " "characters")); (void) zfs_error(hdl, EZFS_BADPROP, errbuf); goto error; } } if (strlen(strval) > ZPROP_MAX_COMMENT) { zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "comment must not exceed %d characters"), ZPROP_MAX_COMMENT); (void) zfs_error(hdl, EZFS_BADPROP, errbuf); goto error; } break; case ZPOOL_PROP_READONLY: if (!flags.import) { zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "property '%s' can only be set at " "import time"), propname); (void) zfs_error(hdl, EZFS_BADPROP, errbuf); goto error; } break; case ZPOOL_PROP_MULTIHOST: if (get_system_hostid() == 0) { zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "requires a non-zero system hostid")); (void) zfs_error(hdl, EZFS_BADPROP, errbuf); goto error; } break; case ZPOOL_PROP_DEDUPDITTO: printf("Note: property '%s' no longer has " "any effect\n", propname); break; default: break; } } return (retprops); error: nvlist_free(retprops); return (NULL); } /* * Set zpool property : propname=propval. */ int zpool_set_prop(zpool_handle_t *zhp, const char *propname, const char *propval) { zfs_cmd_t zc = {"\0"}; int ret = -1; char errbuf[ERRBUFLEN]; nvlist_t *nvl = NULL; nvlist_t *realprops; uint64_t version; prop_flags_t flags = { 0 }; (void) snprintf(errbuf, sizeof (errbuf), dgettext(TEXT_DOMAIN, "cannot set property for '%s'"), zhp->zpool_name); if (nvlist_alloc(&nvl, NV_UNIQUE_NAME, 0) != 0) return (no_memory(zhp->zpool_hdl)); if (nvlist_add_string(nvl, propname, propval) != 0) { nvlist_free(nvl); return (no_memory(zhp->zpool_hdl)); } version = zpool_get_prop_int(zhp, ZPOOL_PROP_VERSION, NULL); if ((realprops = zpool_valid_proplist(zhp->zpool_hdl, zhp->zpool_name, nvl, version, flags, errbuf)) == NULL) { nvlist_free(nvl); return (-1); } nvlist_free(nvl); nvl = realprops; /* * Execute the corresponding ioctl() to set this property. */ (void) strlcpy(zc.zc_name, zhp->zpool_name, sizeof (zc.zc_name)); zcmd_write_src_nvlist(zhp->zpool_hdl, &zc, nvl); ret = zfs_ioctl(zhp->zpool_hdl, ZFS_IOC_POOL_SET_PROPS, &zc); zcmd_free_nvlists(&zc); nvlist_free(nvl); if (ret) (void) zpool_standard_error(zhp->zpool_hdl, errno, errbuf); else (void) zpool_props_refresh(zhp); return (ret); } int zpool_expand_proplist(zpool_handle_t *zhp, zprop_list_t **plp, zfs_type_t type, boolean_t literal) { libzfs_handle_t *hdl = zhp->zpool_hdl; zprop_list_t *entry; char buf[ZFS_MAXPROPLEN]; nvlist_t *features = NULL; nvpair_t *nvp; zprop_list_t **last; boolean_t firstexpand = (NULL == *plp); int i; if (zprop_expand_list(hdl, plp, type) != 0) return (-1); if (type == ZFS_TYPE_VDEV) return (0); last = plp; while (*last != NULL) last = &(*last)->pl_next; if ((*plp)->pl_all) features = zpool_get_features(zhp); if ((*plp)->pl_all && firstexpand) { for (i = 0; i < SPA_FEATURES; i++) { zprop_list_t *entry = zfs_alloc(hdl, sizeof (zprop_list_t)); entry->pl_prop = ZPROP_USERPROP; entry->pl_user_prop = zfs_asprintf(hdl, "feature@%s", spa_feature_table[i].fi_uname); entry->pl_width = strlen(entry->pl_user_prop); entry->pl_all = B_TRUE; *last = entry; last = &entry->pl_next; } } /* add any unsupported features */ for (nvp = nvlist_next_nvpair(features, NULL); nvp != NULL; nvp = nvlist_next_nvpair(features, nvp)) { char *propname; boolean_t found; zprop_list_t *entry; if (zfeature_is_supported(nvpair_name(nvp))) continue; propname = zfs_asprintf(hdl, "unsupported@%s", nvpair_name(nvp)); /* * Before adding the property to the list make sure that no * other pool already added the same property. */ found = B_FALSE; entry = *plp; while (entry != NULL) { if (entry->pl_user_prop != NULL && strcmp(propname, entry->pl_user_prop) == 0) { found = B_TRUE; break; } entry = entry->pl_next; } if (found) { free(propname); continue; } entry = zfs_alloc(hdl, sizeof (zprop_list_t)); entry->pl_prop = ZPROP_USERPROP; entry->pl_user_prop = propname; entry->pl_width = strlen(entry->pl_user_prop); entry->pl_all = B_TRUE; *last = entry; last = &entry->pl_next; } for (entry = *plp; entry != NULL; entry = entry->pl_next) { if (entry->pl_fixed && !literal) continue; if (entry->pl_prop != ZPROP_USERPROP && zpool_get_prop(zhp, entry->pl_prop, buf, sizeof (buf), NULL, literal) == 0) { if (strlen(buf) > entry->pl_width) entry->pl_width = strlen(buf); } } return (0); } int vdev_expand_proplist(zpool_handle_t *zhp, const char *vdevname, zprop_list_t **plp) { zprop_list_t *entry; char buf[ZFS_MAXPROPLEN]; char *strval = NULL; int err = 0; nvpair_t *elem = NULL; nvlist_t *vprops = NULL; nvlist_t *propval = NULL; const char *propname; vdev_prop_t prop; zprop_list_t **last; for (entry = *plp; entry != NULL; entry = entry->pl_next) { if (entry->pl_fixed) continue; if (zpool_get_vdev_prop(zhp, vdevname, entry->pl_prop, entry->pl_user_prop, buf, sizeof (buf), NULL, B_FALSE) == 0) { if (strlen(buf) > entry->pl_width) entry->pl_width = strlen(buf); } if (entry->pl_prop == VDEV_PROP_NAME && strlen(vdevname) > entry->pl_width) entry->pl_width = strlen(vdevname); } /* Handle the all properties case */ last = plp; if (*last != NULL && (*last)->pl_all == B_TRUE) { while (*last != NULL) last = &(*last)->pl_next; err = zpool_get_all_vdev_props(zhp, vdevname, &vprops); if (err != 0) return (err); while ((elem = nvlist_next_nvpair(vprops, elem)) != NULL) { propname = nvpair_name(elem); /* Skip properties that are not user defined */ if ((prop = vdev_name_to_prop(propname)) != VDEV_PROP_USERPROP) continue; if (nvpair_value_nvlist(elem, &propval) != 0) continue; strval = fnvlist_lookup_string(propval, ZPROP_VALUE); entry = zfs_alloc(zhp->zpool_hdl, sizeof (zprop_list_t)); entry->pl_prop = prop; entry->pl_user_prop = zfs_strdup(zhp->zpool_hdl, propname); entry->pl_width = strlen(strval); entry->pl_all = B_TRUE; *last = entry; last = &entry->pl_next; } } return (0); } /* * Get the state for the given feature on the given ZFS pool. */ int zpool_prop_get_feature(zpool_handle_t *zhp, const char *propname, char *buf, size_t len) { uint64_t refcount; boolean_t found = B_FALSE; nvlist_t *features = zpool_get_features(zhp); boolean_t supported; const char *feature = strchr(propname, '@') + 1; supported = zpool_prop_feature(propname); ASSERT(supported || zpool_prop_unsupported(propname)); /* * Convert from feature name to feature guid. This conversion is * unnecessary for unsupported@... properties because they already * use guids. */ if (supported) { int ret; spa_feature_t fid; ret = zfeature_lookup_name(feature, &fid); if (ret != 0) { (void) strlcpy(buf, "-", len); return (ENOTSUP); } feature = spa_feature_table[fid].fi_guid; } if (nvlist_lookup_uint64(features, feature, &refcount) == 0) found = B_TRUE; if (supported) { if (!found) { (void) strlcpy(buf, ZFS_FEATURE_DISABLED, len); } else { if (refcount == 0) (void) strlcpy(buf, ZFS_FEATURE_ENABLED, len); else (void) strlcpy(buf, ZFS_FEATURE_ACTIVE, len); } } else { if (found) { if (refcount == 0) { (void) strcpy(buf, ZFS_UNSUPPORTED_INACTIVE); } else { (void) strcpy(buf, ZFS_UNSUPPORTED_READONLY); } } else { (void) strlcpy(buf, "-", len); return (ENOTSUP); } } return (0); } /* * Validate the given pool name, optionally putting an extended error message in * 'buf'. */ boolean_t zpool_name_valid(libzfs_handle_t *hdl, boolean_t isopen, const char *pool) { namecheck_err_t why; char what; int ret; ret = pool_namecheck(pool, &why, &what); /* * The rules for reserved pool names were extended at a later point. * But we need to support users with existing pools that may now be * invalid. So we only check for this expanded set of names during a * create (or import), and only in userland. */ if (ret == 0 && !isopen && (strncmp(pool, "mirror", 6) == 0 || strncmp(pool, "raidz", 5) == 0 || strncmp(pool, "draid", 5) == 0 || strncmp(pool, "spare", 5) == 0 || strcmp(pool, "log") == 0)) { if (hdl != NULL) zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "name is reserved")); return (B_FALSE); } if (ret != 0) { if (hdl != NULL) { switch (why) { case NAME_ERR_TOOLONG: zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "name is too long")); break; case NAME_ERR_INVALCHAR: zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "invalid character " "'%c' in pool name"), what); break; case NAME_ERR_NOLETTER: zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "name must begin with a letter")); break; case NAME_ERR_RESERVED: zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "name is reserved")); break; case NAME_ERR_DISKLIKE: zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "pool name is reserved")); break; case NAME_ERR_LEADING_SLASH: zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "leading slash in name")); break; case NAME_ERR_EMPTY_COMPONENT: zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "empty component in name")); break; case NAME_ERR_TRAILING_SLASH: zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "trailing slash in name")); break; case NAME_ERR_MULTIPLE_DELIMITERS: zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "multiple '@' and/or '#' delimiters in " "name")); break; case NAME_ERR_NO_AT: zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "permission set is missing '@'")); break; default: zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "(%d) not defined"), why); break; } } return (B_FALSE); } return (B_TRUE); } /* * Open a handle to the given pool, even if the pool is currently in the FAULTED * state. */ zpool_handle_t * zpool_open_canfail(libzfs_handle_t *hdl, const char *pool) { zpool_handle_t *zhp; boolean_t missing; /* * Make sure the pool name is valid. */ if (!zpool_name_valid(hdl, B_TRUE, pool)) { (void) zfs_error_fmt(hdl, EZFS_INVALIDNAME, dgettext(TEXT_DOMAIN, "cannot open '%s'"), pool); return (NULL); } zhp = zfs_alloc(hdl, sizeof (zpool_handle_t)); zhp->zpool_hdl = hdl; (void) strlcpy(zhp->zpool_name, pool, sizeof (zhp->zpool_name)); if (zpool_refresh_stats(zhp, &missing) != 0) { zpool_close(zhp); return (NULL); } if (missing) { zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "no such pool")); (void) zfs_error_fmt(hdl, EZFS_NOENT, dgettext(TEXT_DOMAIN, "cannot open '%s'"), pool); zpool_close(zhp); return (NULL); } return (zhp); } /* * Like the above, but silent on error. Used when iterating over pools (because * the configuration cache may be out of date). */ int zpool_open_silent(libzfs_handle_t *hdl, const char *pool, zpool_handle_t **ret) { zpool_handle_t *zhp; boolean_t missing; zhp = zfs_alloc(hdl, sizeof (zpool_handle_t)); zhp->zpool_hdl = hdl; (void) strlcpy(zhp->zpool_name, pool, sizeof (zhp->zpool_name)); if (zpool_refresh_stats(zhp, &missing) != 0) { zpool_close(zhp); return (-1); } if (missing) { zpool_close(zhp); *ret = NULL; return (0); } *ret = zhp; return (0); } /* * Similar to zpool_open_canfail(), but refuses to open pools in the faulted * state. */ zpool_handle_t * zpool_open(libzfs_handle_t *hdl, const char *pool) { zpool_handle_t *zhp; if ((zhp = zpool_open_canfail(hdl, pool)) == NULL) return (NULL); if (zhp->zpool_state == POOL_STATE_UNAVAIL) { (void) zfs_error_fmt(hdl, EZFS_POOLUNAVAIL, dgettext(TEXT_DOMAIN, "cannot open '%s'"), zhp->zpool_name); zpool_close(zhp); return (NULL); } return (zhp); } /* * Close the handle. Simply frees the memory associated with the handle. */ void zpool_close(zpool_handle_t *zhp) { nvlist_free(zhp->zpool_config); nvlist_free(zhp->zpool_old_config); nvlist_free(zhp->zpool_props); free(zhp); } /* * Return the name of the pool. */ const char * zpool_get_name(zpool_handle_t *zhp) { return (zhp->zpool_name); } /* * Return the state of the pool (ACTIVE or UNAVAILABLE) */ int zpool_get_state(zpool_handle_t *zhp) { return (zhp->zpool_state); } /* * Check if vdev list contains a special vdev */ static boolean_t zpool_has_special_vdev(nvlist_t *nvroot) { nvlist_t **child; uint_t children; if (nvlist_lookup_nvlist_array(nvroot, ZPOOL_CONFIG_CHILDREN, &child, &children) == 0) { for (uint_t c = 0; c < children; c++) { char *bias; if (nvlist_lookup_string(child[c], ZPOOL_CONFIG_ALLOCATION_BIAS, &bias) == 0 && strcmp(bias, VDEV_ALLOC_BIAS_SPECIAL) == 0) { return (B_TRUE); } } } return (B_FALSE); } /* * Check if vdev list contains a dRAID vdev */ static boolean_t zpool_has_draid_vdev(nvlist_t *nvroot) { nvlist_t **child; uint_t children; if (nvlist_lookup_nvlist_array(nvroot, ZPOOL_CONFIG_CHILDREN, &child, &children) == 0) { for (uint_t c = 0; c < children; c++) { char *type; if (nvlist_lookup_string(child[c], ZPOOL_CONFIG_TYPE, &type) == 0 && strcmp(type, VDEV_TYPE_DRAID) == 0) { return (B_TRUE); } } } return (B_FALSE); } /* * Output a dRAID top-level vdev name in to the provided buffer. */ static char * zpool_draid_name(char *name, int len, uint64_t data, uint64_t parity, uint64_t spares, uint64_t children) { snprintf(name, len, "%s%llu:%llud:%lluc:%llus", VDEV_TYPE_DRAID, (u_longlong_t)parity, (u_longlong_t)data, (u_longlong_t)children, (u_longlong_t)spares); return (name); } /* * Return B_TRUE if the provided name is a dRAID spare name. */ boolean_t zpool_is_draid_spare(const char *name) { uint64_t spare_id, parity, vdev_id; if (sscanf(name, VDEV_TYPE_DRAID "%llu-%llu-%llu", (u_longlong_t *)&parity, (u_longlong_t *)&vdev_id, (u_longlong_t *)&spare_id) == 3) { return (B_TRUE); } return (B_FALSE); } /* * Create the named pool, using the provided vdev list. It is assumed * that the consumer has already validated the contents of the nvlist, so we * don't have to worry about error semantics. */ int zpool_create(libzfs_handle_t *hdl, const char *pool, nvlist_t *nvroot, nvlist_t *props, nvlist_t *fsprops) { zfs_cmd_t zc = {"\0"}; nvlist_t *zc_fsprops = NULL; nvlist_t *zc_props = NULL; nvlist_t *hidden_args = NULL; uint8_t *wkeydata = NULL; uint_t wkeylen = 0; char errbuf[ERRBUFLEN]; int ret = -1; (void) snprintf(errbuf, sizeof (errbuf), dgettext(TEXT_DOMAIN, "cannot create '%s'"), pool); if (!zpool_name_valid(hdl, B_FALSE, pool)) return (zfs_error(hdl, EZFS_INVALIDNAME, errbuf)); zcmd_write_conf_nvlist(hdl, &zc, nvroot); if (props) { prop_flags_t flags = { .create = B_TRUE, .import = B_FALSE }; if ((zc_props = zpool_valid_proplist(hdl, pool, props, SPA_VERSION_1, flags, errbuf)) == NULL) { goto create_failed; } } if (fsprops) { uint64_t zoned; char *zonestr; zoned = ((nvlist_lookup_string(fsprops, zfs_prop_to_name(ZFS_PROP_ZONED), &zonestr) == 0) && strcmp(zonestr, "on") == 0); if ((zc_fsprops = zfs_valid_proplist(hdl, ZFS_TYPE_FILESYSTEM, fsprops, zoned, NULL, NULL, B_TRUE, errbuf)) == NULL) { goto create_failed; } if (nvlist_exists(zc_fsprops, zfs_prop_to_name(ZFS_PROP_SPECIAL_SMALL_BLOCKS)) && !zpool_has_special_vdev(nvroot)) { zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "%s property requires a special vdev"), zfs_prop_to_name(ZFS_PROP_SPECIAL_SMALL_BLOCKS)); (void) zfs_error(hdl, EZFS_BADPROP, errbuf); goto create_failed; } if (!zc_props && (nvlist_alloc(&zc_props, NV_UNIQUE_NAME, 0) != 0)) { goto create_failed; } if (zfs_crypto_create(hdl, NULL, zc_fsprops, props, B_TRUE, &wkeydata, &wkeylen) != 0) { zfs_error(hdl, EZFS_CRYPTOFAILED, errbuf); goto create_failed; } if (nvlist_add_nvlist(zc_props, ZPOOL_ROOTFS_PROPS, zc_fsprops) != 0) { goto create_failed; } if (wkeydata != NULL) { if (nvlist_alloc(&hidden_args, NV_UNIQUE_NAME, 0) != 0) goto create_failed; if (nvlist_add_uint8_array(hidden_args, "wkeydata", wkeydata, wkeylen) != 0) goto create_failed; if (nvlist_add_nvlist(zc_props, ZPOOL_HIDDEN_ARGS, hidden_args) != 0) goto create_failed; } } if (zc_props) zcmd_write_src_nvlist(hdl, &zc, zc_props); (void) strlcpy(zc.zc_name, pool, sizeof (zc.zc_name)); if ((ret = zfs_ioctl(hdl, ZFS_IOC_POOL_CREATE, &zc)) != 0) { zcmd_free_nvlists(&zc); nvlist_free(zc_props); nvlist_free(zc_fsprops); nvlist_free(hidden_args); if (wkeydata != NULL) free(wkeydata); switch (errno) { case EBUSY: /* * This can happen if the user has specified the same * device multiple times. We can't reliably detect this * until we try to add it and see we already have a * label. This can also happen under if the device is * part of an active md or lvm device. */ zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "one or more vdevs refer to the same device, or " "one of\nthe devices is part of an active md or " "lvm device")); return (zfs_error(hdl, EZFS_BADDEV, errbuf)); case ERANGE: /* * This happens if the record size is smaller or larger * than the allowed size range, or not a power of 2. * * NOTE: although zfs_valid_proplist is called earlier, * this case may have slipped through since the * pool does not exist yet and it is therefore * impossible to read properties e.g. max blocksize * from the pool. */ zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "record size invalid")); return (zfs_error(hdl, EZFS_BADPROP, errbuf)); case EOVERFLOW: /* * This occurs when one of the devices is below * SPA_MINDEVSIZE. Unfortunately, we can't detect which * device was the problem device since there's no * reliable way to determine device size from userland. */ { char buf[64]; zfs_nicebytes(SPA_MINDEVSIZE, buf, sizeof (buf)); zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "one or more devices is less than the " "minimum size (%s)"), buf); } return (zfs_error(hdl, EZFS_BADDEV, errbuf)); case ENOSPC: zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "one or more devices is out of space")); return (zfs_error(hdl, EZFS_BADDEV, errbuf)); case EINVAL: if (zpool_has_draid_vdev(nvroot) && zfeature_lookup_name("draid", NULL) != 0) { zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "dRAID vdevs are unsupported by the " "kernel")); return (zfs_error(hdl, EZFS_BADDEV, errbuf)); } else { return (zpool_standard_error(hdl, errno, errbuf)); } default: return (zpool_standard_error(hdl, errno, errbuf)); } } create_failed: zcmd_free_nvlists(&zc); nvlist_free(zc_props); nvlist_free(zc_fsprops); nvlist_free(hidden_args); if (wkeydata != NULL) free(wkeydata); return (ret); } /* * Destroy the given pool. It is up to the caller to ensure that there are no * datasets left in the pool. */ int zpool_destroy(zpool_handle_t *zhp, const char *log_str) { zfs_cmd_t zc = {"\0"}; zfs_handle_t *zfp = NULL; libzfs_handle_t *hdl = zhp->zpool_hdl; char errbuf[ERRBUFLEN]; if (zhp->zpool_state == POOL_STATE_ACTIVE && (zfp = zfs_open(hdl, zhp->zpool_name, ZFS_TYPE_FILESYSTEM)) == NULL) return (-1); (void) strlcpy(zc.zc_name, zhp->zpool_name, sizeof (zc.zc_name)); zc.zc_history = (uint64_t)(uintptr_t)log_str; if (zfs_ioctl(hdl, ZFS_IOC_POOL_DESTROY, &zc) != 0) { (void) snprintf(errbuf, sizeof (errbuf), dgettext(TEXT_DOMAIN, "cannot destroy '%s'"), zhp->zpool_name); if (errno == EROFS) { zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "one or more devices is read only")); (void) zfs_error(hdl, EZFS_BADDEV, errbuf); } else { (void) zpool_standard_error(hdl, errno, errbuf); } if (zfp) zfs_close(zfp); return (-1); } if (zfp) { remove_mountpoint(zfp); zfs_close(zfp); } return (0); } /* * Create a checkpoint in the given pool. */ int zpool_checkpoint(zpool_handle_t *zhp) { libzfs_handle_t *hdl = zhp->zpool_hdl; char errbuf[ERRBUFLEN]; int error; error = lzc_pool_checkpoint(zhp->zpool_name); if (error != 0) { (void) snprintf(errbuf, sizeof (errbuf), dgettext(TEXT_DOMAIN, "cannot checkpoint '%s'"), zhp->zpool_name); (void) zpool_standard_error(hdl, error, errbuf); return (-1); } return (0); } /* * Discard the checkpoint from the given pool. */ int zpool_discard_checkpoint(zpool_handle_t *zhp) { libzfs_handle_t *hdl = zhp->zpool_hdl; char errbuf[ERRBUFLEN]; int error; error = lzc_pool_checkpoint_discard(zhp->zpool_name); if (error != 0) { (void) snprintf(errbuf, sizeof (errbuf), dgettext(TEXT_DOMAIN, "cannot discard checkpoint in '%s'"), zhp->zpool_name); (void) zpool_standard_error(hdl, error, errbuf); return (-1); } return (0); } /* * Add the given vdevs to the pool. The caller must have already performed the * necessary verification to ensure that the vdev specification is well-formed. */ int zpool_add(zpool_handle_t *zhp, nvlist_t *nvroot) { zfs_cmd_t zc = {"\0"}; int ret; libzfs_handle_t *hdl = zhp->zpool_hdl; char errbuf[ERRBUFLEN]; nvlist_t **spares, **l2cache; uint_t nspares, nl2cache; (void) snprintf(errbuf, sizeof (errbuf), dgettext(TEXT_DOMAIN, "cannot add to '%s'"), zhp->zpool_name); if (zpool_get_prop_int(zhp, ZPOOL_PROP_VERSION, NULL) < SPA_VERSION_SPARES && nvlist_lookup_nvlist_array(nvroot, ZPOOL_CONFIG_SPARES, &spares, &nspares) == 0) { zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "pool must be " "upgraded to add hot spares")); return (zfs_error(hdl, EZFS_BADVERSION, errbuf)); } if (zpool_get_prop_int(zhp, ZPOOL_PROP_VERSION, NULL) < SPA_VERSION_L2CACHE && nvlist_lookup_nvlist_array(nvroot, ZPOOL_CONFIG_L2CACHE, &l2cache, &nl2cache) == 0) { zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "pool must be " "upgraded to add cache devices")); return (zfs_error(hdl, EZFS_BADVERSION, errbuf)); } zcmd_write_conf_nvlist(hdl, &zc, nvroot); (void) strlcpy(zc.zc_name, zhp->zpool_name, sizeof (zc.zc_name)); if (zfs_ioctl(hdl, ZFS_IOC_VDEV_ADD, &zc) != 0) { switch (errno) { case EBUSY: /* * This can happen if the user has specified the same * device multiple times. We can't reliably detect this * until we try to add it and see we already have a * label. */ zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "one or more vdevs refer to the same device")); (void) zfs_error(hdl, EZFS_BADDEV, errbuf); break; case EINVAL: if (zpool_has_draid_vdev(nvroot) && zfeature_lookup_name("draid", NULL) != 0) { zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "dRAID vdevs are unsupported by the " "kernel")); } else { zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "invalid config; a pool with removing/" "removed vdevs does not support adding " "raidz or dRAID vdevs")); } (void) zfs_error(hdl, EZFS_BADDEV, errbuf); break; case EOVERFLOW: /* * This occurs when one of the devices is below * SPA_MINDEVSIZE. Unfortunately, we can't detect which * device was the problem device since there's no * reliable way to determine device size from userland. */ { char buf[64]; zfs_nicebytes(SPA_MINDEVSIZE, buf, sizeof (buf)); zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "device is less than the minimum " "size (%s)"), buf); } (void) zfs_error(hdl, EZFS_BADDEV, errbuf); break; case ENOTSUP: zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "pool must be upgraded to add these vdevs")); (void) zfs_error(hdl, EZFS_BADVERSION, errbuf); break; default: (void) zpool_standard_error(hdl, errno, errbuf); } ret = -1; } else { ret = 0; } zcmd_free_nvlists(&zc); return (ret); } /* * Exports the pool from the system. The caller must ensure that there are no * mounted datasets in the pool. */ static int zpool_export_common(zpool_handle_t *zhp, boolean_t force, boolean_t hardforce, const char *log_str) { zfs_cmd_t zc = {"\0"}; (void) strlcpy(zc.zc_name, zhp->zpool_name, sizeof (zc.zc_name)); zc.zc_cookie = force; zc.zc_guid = hardforce; zc.zc_history = (uint64_t)(uintptr_t)log_str; if (zfs_ioctl(zhp->zpool_hdl, ZFS_IOC_POOL_EXPORT, &zc) != 0) { switch (errno) { case EXDEV: zfs_error_aux(zhp->zpool_hdl, dgettext(TEXT_DOMAIN, "use '-f' to override the following errors:\n" "'%s' has an active shared spare which could be" " used by other pools once '%s' is exported."), zhp->zpool_name, zhp->zpool_name); return (zfs_error_fmt(zhp->zpool_hdl, EZFS_ACTIVE_SPARE, dgettext(TEXT_DOMAIN, "cannot export '%s'"), zhp->zpool_name)); default: return (zpool_standard_error_fmt(zhp->zpool_hdl, errno, dgettext(TEXT_DOMAIN, "cannot export '%s'"), zhp->zpool_name)); } } return (0); } int zpool_export(zpool_handle_t *zhp, boolean_t force, const char *log_str) { return (zpool_export_common(zhp, force, B_FALSE, log_str)); } int zpool_export_force(zpool_handle_t *zhp, const char *log_str) { return (zpool_export_common(zhp, B_TRUE, B_TRUE, log_str)); } static void zpool_rewind_exclaim(libzfs_handle_t *hdl, const char *name, boolean_t dryrun, nvlist_t *config) { nvlist_t *nv = NULL; uint64_t rewindto; int64_t loss = -1; struct tm t; char timestr[128]; if (!hdl->libzfs_printerr || config == NULL) return; if (nvlist_lookup_nvlist(config, ZPOOL_CONFIG_LOAD_INFO, &nv) != 0 || nvlist_lookup_nvlist(nv, ZPOOL_CONFIG_REWIND_INFO, &nv) != 0) { return; } if (nvlist_lookup_uint64(nv, ZPOOL_CONFIG_LOAD_TIME, &rewindto) != 0) return; (void) nvlist_lookup_int64(nv, ZPOOL_CONFIG_REWIND_TIME, &loss); if (localtime_r((time_t *)&rewindto, &t) != NULL && strftime(timestr, 128, "%c", &t) != 0) { if (dryrun) { (void) printf(dgettext(TEXT_DOMAIN, "Would be able to return %s " "to its state as of %s.\n"), name, timestr); } else { (void) printf(dgettext(TEXT_DOMAIN, "Pool %s returned to its state as of %s.\n"), name, timestr); } if (loss > 120) { (void) printf(dgettext(TEXT_DOMAIN, "%s approximately %lld "), dryrun ? "Would discard" : "Discarded", ((longlong_t)loss + 30) / 60); (void) printf(dgettext(TEXT_DOMAIN, "minutes of transactions.\n")); } else if (loss > 0) { (void) printf(dgettext(TEXT_DOMAIN, "%s approximately %lld "), dryrun ? "Would discard" : "Discarded", (longlong_t)loss); (void) printf(dgettext(TEXT_DOMAIN, "seconds of transactions.\n")); } } } void zpool_explain_recover(libzfs_handle_t *hdl, const char *name, int reason, nvlist_t *config) { nvlist_t *nv = NULL; int64_t loss = -1; uint64_t edata = UINT64_MAX; uint64_t rewindto; struct tm t; char timestr[128]; if (!hdl->libzfs_printerr) return; if (reason >= 0) (void) printf(dgettext(TEXT_DOMAIN, "action: ")); else (void) printf(dgettext(TEXT_DOMAIN, "\t")); /* All attempted rewinds failed if ZPOOL_CONFIG_LOAD_TIME missing */ if (nvlist_lookup_nvlist(config, ZPOOL_CONFIG_LOAD_INFO, &nv) != 0 || nvlist_lookup_nvlist(nv, ZPOOL_CONFIG_REWIND_INFO, &nv) != 0 || nvlist_lookup_uint64(nv, ZPOOL_CONFIG_LOAD_TIME, &rewindto) != 0) goto no_info; (void) nvlist_lookup_int64(nv, ZPOOL_CONFIG_REWIND_TIME, &loss); (void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_LOAD_DATA_ERRORS, &edata); (void) printf(dgettext(TEXT_DOMAIN, "Recovery is possible, but will result in some data loss.\n")); if (localtime_r((time_t *)&rewindto, &t) != NULL && strftime(timestr, 128, "%c", &t) != 0) { (void) printf(dgettext(TEXT_DOMAIN, "\tReturning the pool to its state as of %s\n" "\tshould correct the problem. "), timestr); } else { (void) printf(dgettext(TEXT_DOMAIN, "\tReverting the pool to an earlier state " "should correct the problem.\n\t")); } if (loss > 120) { (void) printf(dgettext(TEXT_DOMAIN, "Approximately %lld minutes of data\n" "\tmust be discarded, irreversibly. "), ((longlong_t)loss + 30) / 60); } else if (loss > 0) { (void) printf(dgettext(TEXT_DOMAIN, "Approximately %lld seconds of data\n" "\tmust be discarded, irreversibly. "), (longlong_t)loss); } if (edata != 0 && edata != UINT64_MAX) { if (edata == 1) { (void) printf(dgettext(TEXT_DOMAIN, "After rewind, at least\n" "\tone persistent user-data error will remain. ")); } else { (void) printf(dgettext(TEXT_DOMAIN, "After rewind, several\n" "\tpersistent user-data errors will remain. ")); } } (void) printf(dgettext(TEXT_DOMAIN, "Recovery can be attempted\n\tby executing 'zpool %s -F %s'. "), reason >= 0 ? "clear" : "import", name); (void) printf(dgettext(TEXT_DOMAIN, "A scrub of the pool\n" "\tis strongly recommended after recovery.\n")); return; no_info: (void) printf(dgettext(TEXT_DOMAIN, "Destroy and re-create the pool from\n\ta backup source.\n")); } /* * zpool_import() is a contracted interface. Should be kept the same * if possible. * * Applications should use zpool_import_props() to import a pool with * new properties value to be set. */ int zpool_import(libzfs_handle_t *hdl, nvlist_t *config, const char *newname, char *altroot) { nvlist_t *props = NULL; int ret; if (altroot != NULL) { if (nvlist_alloc(&props, NV_UNIQUE_NAME, 0) != 0) { return (zfs_error_fmt(hdl, EZFS_NOMEM, dgettext(TEXT_DOMAIN, "cannot import '%s'"), newname)); } if (nvlist_add_string(props, zpool_prop_to_name(ZPOOL_PROP_ALTROOT), altroot) != 0 || nvlist_add_string(props, zpool_prop_to_name(ZPOOL_PROP_CACHEFILE), "none") != 0) { nvlist_free(props); return (zfs_error_fmt(hdl, EZFS_NOMEM, dgettext(TEXT_DOMAIN, "cannot import '%s'"), newname)); } } ret = zpool_import_props(hdl, config, newname, props, ZFS_IMPORT_NORMAL); nvlist_free(props); return (ret); } static void print_vdev_tree(libzfs_handle_t *hdl, const char *name, nvlist_t *nv, int indent) { nvlist_t **child; uint_t c, children; char *vname; uint64_t is_log = 0; (void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_IS_LOG, &is_log); if (name != NULL) (void) printf("\t%*s%s%s\n", indent, "", name, is_log ? " [log]" : ""); if (nvlist_lookup_nvlist_array(nv, ZPOOL_CONFIG_CHILDREN, &child, &children) != 0) return; for (c = 0; c < children; c++) { vname = zpool_vdev_name(hdl, NULL, child[c], VDEV_NAME_TYPE_ID); print_vdev_tree(hdl, vname, child[c], indent + 2); free(vname); } } void zpool_print_unsup_feat(nvlist_t *config) { nvlist_t *nvinfo, *unsup_feat; nvinfo = fnvlist_lookup_nvlist(config, ZPOOL_CONFIG_LOAD_INFO); unsup_feat = fnvlist_lookup_nvlist(nvinfo, ZPOOL_CONFIG_UNSUP_FEAT); for (nvpair_t *nvp = nvlist_next_nvpair(unsup_feat, NULL); nvp != NULL; nvp = nvlist_next_nvpair(unsup_feat, nvp)) { char *desc = fnvpair_value_string(nvp); if (strlen(desc) > 0) (void) printf("\t%s (%s)\n", nvpair_name(nvp), desc); else (void) printf("\t%s\n", nvpair_name(nvp)); } } /* * Import the given pool using the known configuration and a list of * properties to be set. The configuration should have come from * zpool_find_import(). The 'newname' parameters control whether the pool * is imported with a different name. */ int zpool_import_props(libzfs_handle_t *hdl, nvlist_t *config, const char *newname, nvlist_t *props, int flags) { zfs_cmd_t zc = {"\0"}; zpool_load_policy_t policy; nvlist_t *nv = NULL; nvlist_t *nvinfo = NULL; nvlist_t *missing = NULL; const char *thename; char *origname; int ret; int error = 0; char errbuf[ERRBUFLEN]; origname = fnvlist_lookup_string(config, ZPOOL_CONFIG_POOL_NAME); (void) snprintf(errbuf, sizeof (errbuf), dgettext(TEXT_DOMAIN, "cannot import pool '%s'"), origname); if (newname != NULL) { if (!zpool_name_valid(hdl, B_FALSE, newname)) return (zfs_error_fmt(hdl, EZFS_INVALIDNAME, dgettext(TEXT_DOMAIN, "cannot import '%s'"), newname)); thename = newname; } else { thename = origname; } if (props != NULL) { uint64_t version; prop_flags_t flags = { .create = B_FALSE, .import = B_TRUE }; version = fnvlist_lookup_uint64(config, ZPOOL_CONFIG_VERSION); if ((props = zpool_valid_proplist(hdl, origname, props, version, flags, errbuf)) == NULL) return (-1); zcmd_write_src_nvlist(hdl, &zc, props); nvlist_free(props); } (void) strlcpy(zc.zc_name, thename, sizeof (zc.zc_name)); zc.zc_guid = fnvlist_lookup_uint64(config, ZPOOL_CONFIG_POOL_GUID); zcmd_write_conf_nvlist(hdl, &zc, config); zcmd_alloc_dst_nvlist(hdl, &zc, zc.zc_nvlist_conf_size * 2); zc.zc_cookie = flags; while ((ret = zfs_ioctl(hdl, ZFS_IOC_POOL_IMPORT, &zc)) != 0 && errno == ENOMEM) zcmd_expand_dst_nvlist(hdl, &zc); if (ret != 0) error = errno; (void) zcmd_read_dst_nvlist(hdl, &zc, &nv); zcmd_free_nvlists(&zc); zpool_get_load_policy(config, &policy); if (error) { char desc[1024]; char aux[256]; /* * Dry-run failed, but we print out what success * looks like if we found a best txg */ if (policy.zlp_rewind & ZPOOL_TRY_REWIND) { zpool_rewind_exclaim(hdl, newname ? origname : thename, B_TRUE, nv); nvlist_free(nv); return (-1); } if (newname == NULL) (void) snprintf(desc, sizeof (desc), dgettext(TEXT_DOMAIN, "cannot import '%s'"), thename); else (void) snprintf(desc, sizeof (desc), dgettext(TEXT_DOMAIN, "cannot import '%s' as '%s'"), origname, thename); switch (error) { case ENOTSUP: if (nv != NULL && nvlist_lookup_nvlist(nv, ZPOOL_CONFIG_LOAD_INFO, &nvinfo) == 0 && nvlist_exists(nvinfo, ZPOOL_CONFIG_UNSUP_FEAT)) { (void) printf(dgettext(TEXT_DOMAIN, "This " "pool uses the following feature(s) not " "supported by this system:\n")); zpool_print_unsup_feat(nv); if (nvlist_exists(nvinfo, ZPOOL_CONFIG_CAN_RDONLY)) { (void) printf(dgettext(TEXT_DOMAIN, "All unsupported features are only " "required for writing to the pool." "\nThe pool can be imported using " "'-o readonly=on'.\n")); } } /* * Unsupported version. */ (void) zfs_error(hdl, EZFS_BADVERSION, desc); break; case EREMOTEIO: if (nv != NULL && nvlist_lookup_nvlist(nv, ZPOOL_CONFIG_LOAD_INFO, &nvinfo) == 0) { const char *hostname = ""; uint64_t hostid = 0; mmp_state_t mmp_state; mmp_state = fnvlist_lookup_uint64(nvinfo, ZPOOL_CONFIG_MMP_STATE); if (nvlist_exists(nvinfo, ZPOOL_CONFIG_MMP_HOSTNAME)) hostname = fnvlist_lookup_string(nvinfo, ZPOOL_CONFIG_MMP_HOSTNAME); if (nvlist_exists(nvinfo, ZPOOL_CONFIG_MMP_HOSTID)) hostid = fnvlist_lookup_uint64(nvinfo, ZPOOL_CONFIG_MMP_HOSTID); if (mmp_state == MMP_STATE_ACTIVE) { (void) snprintf(aux, sizeof (aux), dgettext(TEXT_DOMAIN, "pool is imp" "orted on host '%s' (hostid=%lx).\n" "Export the pool on the other " "system, then run 'zpool import'."), hostname, (unsigned long) hostid); } else if (mmp_state == MMP_STATE_NO_HOSTID) { (void) snprintf(aux, sizeof (aux), dgettext(TEXT_DOMAIN, "pool has " "the multihost property on and " "the\nsystem's hostid is not set. " "Set a unique system hostid with " "the zgenhostid(8) command.\n")); } (void) zfs_error_aux(hdl, "%s", aux); } (void) zfs_error(hdl, EZFS_ACTIVE_POOL, desc); break; case EINVAL: (void) zfs_error(hdl, EZFS_INVALCONFIG, desc); break; case EROFS: zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "one or more devices is read only")); (void) zfs_error(hdl, EZFS_BADDEV, desc); break; case ENXIO: if (nv && nvlist_lookup_nvlist(nv, ZPOOL_CONFIG_LOAD_INFO, &nvinfo) == 0 && nvlist_lookup_nvlist(nvinfo, ZPOOL_CONFIG_MISSING_DEVICES, &missing) == 0) { (void) printf(dgettext(TEXT_DOMAIN, "The devices below are missing or " "corrupted, use '-m' to import the pool " "anyway:\n")); print_vdev_tree(hdl, NULL, missing, 2); (void) printf("\n"); } (void) zpool_standard_error(hdl, error, desc); break; case EEXIST: (void) zpool_standard_error(hdl, error, desc); break; case EBUSY: zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "one or more devices are already in use\n")); (void) zfs_error(hdl, EZFS_BADDEV, desc); break; case ENAMETOOLONG: zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "new name of at least one dataset is longer than " "the maximum allowable length")); (void) zfs_error(hdl, EZFS_NAMETOOLONG, desc); break; default: (void) zpool_standard_error(hdl, error, desc); zpool_explain_recover(hdl, newname ? origname : thename, -error, nv); break; } nvlist_free(nv); ret = -1; } else { zpool_handle_t *zhp; /* * This should never fail, but play it safe anyway. */ if (zpool_open_silent(hdl, thename, &zhp) != 0) ret = -1; else if (zhp != NULL) zpool_close(zhp); if (policy.zlp_rewind & (ZPOOL_DO_REWIND | ZPOOL_TRY_REWIND)) { zpool_rewind_exclaim(hdl, newname ? origname : thename, ((policy.zlp_rewind & ZPOOL_TRY_REWIND) != 0), nv); } nvlist_free(nv); - return (0); } return (ret); } /* * Translate vdev names to guids. If a vdev_path is determined to be * unsuitable then a vd_errlist is allocated and the vdev path and errno * are added to it. */ static int zpool_translate_vdev_guids(zpool_handle_t *zhp, nvlist_t *vds, nvlist_t *vdev_guids, nvlist_t *guids_to_paths, nvlist_t **vd_errlist) { nvlist_t *errlist = NULL; int error = 0; for (nvpair_t *elem = nvlist_next_nvpair(vds, NULL); elem != NULL; elem = nvlist_next_nvpair(vds, elem)) { boolean_t spare, cache; char *vd_path = nvpair_name(elem); nvlist_t *tgt = zpool_find_vdev(zhp, vd_path, &spare, &cache, NULL); if ((tgt == NULL) || cache || spare) { if (errlist == NULL) { errlist = fnvlist_alloc(); error = EINVAL; } uint64_t err = (tgt == NULL) ? EZFS_NODEVICE : (spare ? EZFS_ISSPARE : EZFS_ISL2CACHE); fnvlist_add_int64(errlist, vd_path, err); continue; } uint64_t guid = fnvlist_lookup_uint64(tgt, ZPOOL_CONFIG_GUID); fnvlist_add_uint64(vdev_guids, vd_path, guid); char msg[MAXNAMELEN]; (void) snprintf(msg, sizeof (msg), "%llu", (u_longlong_t)guid); fnvlist_add_string(guids_to_paths, msg, vd_path); } if (error != 0) { verify(errlist != NULL); if (vd_errlist != NULL) *vd_errlist = errlist; else fnvlist_free(errlist); } return (error); } static int xlate_init_err(int err) { switch (err) { case ENODEV: return (EZFS_NODEVICE); case EINVAL: case EROFS: return (EZFS_BADDEV); case EBUSY: return (EZFS_INITIALIZING); case ESRCH: return (EZFS_NO_INITIALIZE); } return (err); } /* * Begin, suspend, or cancel the initialization (initializing of all free * blocks) for the given vdevs in the given pool. */ static int zpool_initialize_impl(zpool_handle_t *zhp, pool_initialize_func_t cmd_type, nvlist_t *vds, boolean_t wait) { int err; nvlist_t *vdev_guids = fnvlist_alloc(); nvlist_t *guids_to_paths = fnvlist_alloc(); nvlist_t *vd_errlist = NULL; nvlist_t *errlist; nvpair_t *elem; err = zpool_translate_vdev_guids(zhp, vds, vdev_guids, guids_to_paths, &vd_errlist); if (err != 0) { verify(vd_errlist != NULL); goto list_errors; } err = lzc_initialize(zhp->zpool_name, cmd_type, vdev_guids, &errlist); if (err != 0) { if (errlist != NULL) { vd_errlist = fnvlist_lookup_nvlist(errlist, ZPOOL_INITIALIZE_VDEVS); goto list_errors; } (void) zpool_standard_error(zhp->zpool_hdl, err, dgettext(TEXT_DOMAIN, "operation failed")); goto out; } if (wait) { for (elem = nvlist_next_nvpair(vdev_guids, NULL); elem != NULL; elem = nvlist_next_nvpair(vdev_guids, elem)) { uint64_t guid = fnvpair_value_uint64(elem); err = lzc_wait_tag(zhp->zpool_name, ZPOOL_WAIT_INITIALIZE, guid, NULL); if (err != 0) { (void) zpool_standard_error_fmt(zhp->zpool_hdl, err, dgettext(TEXT_DOMAIN, "error " "waiting for '%s' to initialize"), nvpair_name(elem)); goto out; } } } goto out; list_errors: for (elem = nvlist_next_nvpair(vd_errlist, NULL); elem != NULL; elem = nvlist_next_nvpair(vd_errlist, elem)) { int64_t vd_error = xlate_init_err(fnvpair_value_int64(elem)); char *path; if (nvlist_lookup_string(guids_to_paths, nvpair_name(elem), &path) != 0) path = nvpair_name(elem); (void) zfs_error_fmt(zhp->zpool_hdl, vd_error, "cannot initialize '%s'", path); } out: fnvlist_free(vdev_guids); fnvlist_free(guids_to_paths); if (vd_errlist != NULL) fnvlist_free(vd_errlist); return (err == 0 ? 0 : -1); } int zpool_initialize(zpool_handle_t *zhp, pool_initialize_func_t cmd_type, nvlist_t *vds) { return (zpool_initialize_impl(zhp, cmd_type, vds, B_FALSE)); } int zpool_initialize_wait(zpool_handle_t *zhp, pool_initialize_func_t cmd_type, nvlist_t *vds) { return (zpool_initialize_impl(zhp, cmd_type, vds, B_TRUE)); } static int xlate_trim_err(int err) { switch (err) { case ENODEV: return (EZFS_NODEVICE); case EINVAL: case EROFS: return (EZFS_BADDEV); case EBUSY: return (EZFS_TRIMMING); case ESRCH: return (EZFS_NO_TRIM); case EOPNOTSUPP: return (EZFS_TRIM_NOTSUP); } return (err); } static int zpool_trim_wait(zpool_handle_t *zhp, nvlist_t *vdev_guids) { int err; nvpair_t *elem; for (elem = nvlist_next_nvpair(vdev_guids, NULL); elem != NULL; elem = nvlist_next_nvpair(vdev_guids, elem)) { uint64_t guid = fnvpair_value_uint64(elem); err = lzc_wait_tag(zhp->zpool_name, ZPOOL_WAIT_TRIM, guid, NULL); if (err != 0) { (void) zpool_standard_error_fmt(zhp->zpool_hdl, err, dgettext(TEXT_DOMAIN, "error " "waiting to trim '%s'"), nvpair_name(elem)); return (err); } } return (0); } /* * Check errlist and report any errors, omitting ones which should be * suppressed. Returns B_TRUE if any errors were reported. */ static boolean_t check_trim_errs(zpool_handle_t *zhp, trimflags_t *trim_flags, nvlist_t *guids_to_paths, nvlist_t *vds, nvlist_t *errlist) { nvpair_t *elem; boolean_t reported_errs = B_FALSE; int num_vds = 0; int num_suppressed_errs = 0; for (elem = nvlist_next_nvpair(vds, NULL); elem != NULL; elem = nvlist_next_nvpair(vds, elem)) { num_vds++; } for (elem = nvlist_next_nvpair(errlist, NULL); elem != NULL; elem = nvlist_next_nvpair(errlist, elem)) { int64_t vd_error = xlate_trim_err(fnvpair_value_int64(elem)); char *path; /* * If only the pool was specified, and it was not a secure * trim then suppress warnings for individual vdevs which * do not support trimming. */ if (vd_error == EZFS_TRIM_NOTSUP && trim_flags->fullpool && !trim_flags->secure) { num_suppressed_errs++; continue; } reported_errs = B_TRUE; if (nvlist_lookup_string(guids_to_paths, nvpair_name(elem), &path) != 0) path = nvpair_name(elem); (void) zfs_error_fmt(zhp->zpool_hdl, vd_error, "cannot trim '%s'", path); } if (num_suppressed_errs == num_vds) { (void) zfs_error_aux(zhp->zpool_hdl, dgettext(TEXT_DOMAIN, "no devices in pool support trim operations")); (void) (zfs_error(zhp->zpool_hdl, EZFS_TRIM_NOTSUP, dgettext(TEXT_DOMAIN, "cannot trim"))); reported_errs = B_TRUE; } return (reported_errs); } /* * Begin, suspend, or cancel the TRIM (discarding of all free blocks) for * the given vdevs in the given pool. */ int zpool_trim(zpool_handle_t *zhp, pool_trim_func_t cmd_type, nvlist_t *vds, trimflags_t *trim_flags) { int err; int retval = 0; nvlist_t *vdev_guids = fnvlist_alloc(); nvlist_t *guids_to_paths = fnvlist_alloc(); nvlist_t *errlist = NULL; err = zpool_translate_vdev_guids(zhp, vds, vdev_guids, guids_to_paths, &errlist); if (err != 0) { check_trim_errs(zhp, trim_flags, guids_to_paths, vds, errlist); retval = -1; goto out; } err = lzc_trim(zhp->zpool_name, cmd_type, trim_flags->rate, trim_flags->secure, vdev_guids, &errlist); if (err != 0) { nvlist_t *vd_errlist; if (errlist != NULL && nvlist_lookup_nvlist(errlist, ZPOOL_TRIM_VDEVS, &vd_errlist) == 0) { if (check_trim_errs(zhp, trim_flags, guids_to_paths, vds, vd_errlist)) { retval = -1; goto out; } } else { char errbuf[ERRBUFLEN]; (void) snprintf(errbuf, sizeof (errbuf), dgettext(TEXT_DOMAIN, "operation failed")); zpool_standard_error(zhp->zpool_hdl, err, errbuf); retval = -1; goto out; } } if (trim_flags->wait) retval = zpool_trim_wait(zhp, vdev_guids); out: if (errlist != NULL) fnvlist_free(errlist); fnvlist_free(vdev_guids); fnvlist_free(guids_to_paths); return (retval); } /* * Scan the pool. */ int zpool_scan(zpool_handle_t *zhp, pool_scan_func_t func, pool_scrub_cmd_t cmd) { zfs_cmd_t zc = {"\0"}; char errbuf[ERRBUFLEN]; int err; libzfs_handle_t *hdl = zhp->zpool_hdl; (void) strlcpy(zc.zc_name, zhp->zpool_name, sizeof (zc.zc_name)); zc.zc_cookie = func; zc.zc_flags = cmd; if (zfs_ioctl(hdl, ZFS_IOC_POOL_SCAN, &zc) == 0) return (0); err = errno; /* ECANCELED on a scrub means we resumed a paused scrub */ if (err == ECANCELED && func == POOL_SCAN_SCRUB && cmd == POOL_SCRUB_NORMAL) return (0); if (err == ENOENT && func != POOL_SCAN_NONE && cmd == POOL_SCRUB_NORMAL) return (0); if (func == POOL_SCAN_SCRUB) { if (cmd == POOL_SCRUB_PAUSE) { (void) snprintf(errbuf, sizeof (errbuf), dgettext(TEXT_DOMAIN, "cannot pause scrubbing %s"), zc.zc_name); } else { assert(cmd == POOL_SCRUB_NORMAL); (void) snprintf(errbuf, sizeof (errbuf), dgettext(TEXT_DOMAIN, "cannot scrub %s"), zc.zc_name); } } else if (func == POOL_SCAN_RESILVER) { assert(cmd == POOL_SCRUB_NORMAL); (void) snprintf(errbuf, sizeof (errbuf), dgettext(TEXT_DOMAIN, "cannot restart resilver on %s"), zc.zc_name); } else if (func == POOL_SCAN_NONE) { (void) snprintf(errbuf, sizeof (errbuf), dgettext(TEXT_DOMAIN, "cannot cancel scrubbing %s"), zc.zc_name); } else { assert(!"unexpected result"); } if (err == EBUSY) { nvlist_t *nvroot; pool_scan_stat_t *ps = NULL; uint_t psc; nvroot = fnvlist_lookup_nvlist(zhp->zpool_config, ZPOOL_CONFIG_VDEV_TREE); (void) nvlist_lookup_uint64_array(nvroot, ZPOOL_CONFIG_SCAN_STATS, (uint64_t **)&ps, &psc); if (ps && ps->pss_func == POOL_SCAN_SCRUB && ps->pss_state == DSS_SCANNING) { if (cmd == POOL_SCRUB_PAUSE) return (zfs_error(hdl, EZFS_SCRUB_PAUSED, errbuf)); else return (zfs_error(hdl, EZFS_SCRUBBING, errbuf)); } else { return (zfs_error(hdl, EZFS_RESILVERING, errbuf)); } } else if (err == ENOENT) { return (zfs_error(hdl, EZFS_NO_SCRUB, errbuf)); } else if (err == ENOTSUP && func == POOL_SCAN_RESILVER) { return (zfs_error(hdl, EZFS_NO_RESILVER_DEFER, errbuf)); } else { return (zpool_standard_error(hdl, err, errbuf)); } } /* * Find a vdev that matches the search criteria specified. We use the * the nvpair name to determine how we should look for the device. * 'avail_spare' is set to TRUE if the provided guid refers to an AVAIL * spare; but FALSE if its an INUSE spare. */ static nvlist_t * vdev_to_nvlist_iter(nvlist_t *nv, nvlist_t *search, boolean_t *avail_spare, boolean_t *l2cache, boolean_t *log) { uint_t c, children; nvlist_t **child; nvlist_t *ret; uint64_t is_log; char *srchkey; nvpair_t *pair = nvlist_next_nvpair(search, NULL); /* Nothing to look for */ if (search == NULL || pair == NULL) return (NULL); /* Obtain the key we will use to search */ srchkey = nvpair_name(pair); switch (nvpair_type(pair)) { case DATA_TYPE_UINT64: if (strcmp(srchkey, ZPOOL_CONFIG_GUID) == 0) { uint64_t srchval = fnvpair_value_uint64(pair); uint64_t theguid = fnvlist_lookup_uint64(nv, ZPOOL_CONFIG_GUID); if (theguid == srchval) return (nv); } break; case DATA_TYPE_STRING: { char *srchval, *val; srchval = fnvpair_value_string(pair); if (nvlist_lookup_string(nv, srchkey, &val) != 0) break; /* * Search for the requested value. Special cases: * * - ZPOOL_CONFIG_PATH for whole disk entries. These end in * "-part1", or "p1". The suffix is hidden from the user, * but included in the string, so this matches around it. * - ZPOOL_CONFIG_PATH for short names zfs_strcmp_shortname() * is used to check all possible expanded paths. * - looking for a top-level vdev name (i.e. ZPOOL_CONFIG_TYPE). * * Otherwise, all other searches are simple string compares. */ if (strcmp(srchkey, ZPOOL_CONFIG_PATH) == 0) { uint64_t wholedisk = 0; (void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_WHOLE_DISK, &wholedisk); if (zfs_strcmp_pathname(srchval, val, wholedisk) == 0) return (nv); } else if (strcmp(srchkey, ZPOOL_CONFIG_TYPE) == 0 && val) { char *type, *idx, *end, *p; uint64_t id, vdev_id; /* * Determine our vdev type, keeping in mind * that the srchval is composed of a type and * vdev id pair (i.e. mirror-4). */ if ((type = strdup(srchval)) == NULL) return (NULL); if ((p = strrchr(type, '-')) == NULL) { free(type); break; } idx = p + 1; *p = '\0'; /* * If the types don't match then keep looking. */ if (strncmp(val, type, strlen(val)) != 0) { free(type); break; } verify(zpool_vdev_is_interior(type)); id = fnvlist_lookup_uint64(nv, ZPOOL_CONFIG_ID); errno = 0; vdev_id = strtoull(idx, &end, 10); /* * If we are looking for a raidz and a parity is * specified, make sure it matches. */ int rzlen = strlen(VDEV_TYPE_RAIDZ); assert(rzlen == strlen(VDEV_TYPE_DRAID)); int typlen = strlen(type); if ((strncmp(type, VDEV_TYPE_RAIDZ, rzlen) == 0 || strncmp(type, VDEV_TYPE_DRAID, rzlen) == 0) && typlen != rzlen) { uint64_t vdev_parity; int parity = *(type + rzlen) - '0'; if (parity <= 0 || parity > 3 || (typlen - rzlen) != 1) { /* * Nonsense parity specified, can * never match */ free(type); return (NULL); } vdev_parity = fnvlist_lookup_uint64(nv, ZPOOL_CONFIG_NPARITY); if ((int)vdev_parity != parity) { free(type); break; } } free(type); if (errno != 0) return (NULL); /* * Now verify that we have the correct vdev id. */ if (vdev_id == id) return (nv); } /* * Common case */ if (strcmp(srchval, val) == 0) return (nv); break; } default: break; } if (nvlist_lookup_nvlist_array(nv, ZPOOL_CONFIG_CHILDREN, &child, &children) != 0) return (NULL); for (c = 0; c < children; c++) { if ((ret = vdev_to_nvlist_iter(child[c], search, avail_spare, l2cache, NULL)) != NULL) { /* * The 'is_log' value is only set for the toplevel * vdev, not the leaf vdevs. So we always lookup the * log device from the root of the vdev tree (where * 'log' is non-NULL). */ if (log != NULL && nvlist_lookup_uint64(child[c], ZPOOL_CONFIG_IS_LOG, &is_log) == 0 && is_log) { *log = B_TRUE; } return (ret); } } if (nvlist_lookup_nvlist_array(nv, ZPOOL_CONFIG_SPARES, &child, &children) == 0) { for (c = 0; c < children; c++) { if ((ret = vdev_to_nvlist_iter(child[c], search, avail_spare, l2cache, NULL)) != NULL) { *avail_spare = B_TRUE; return (ret); } } } if (nvlist_lookup_nvlist_array(nv, ZPOOL_CONFIG_L2CACHE, &child, &children) == 0) { for (c = 0; c < children; c++) { if ((ret = vdev_to_nvlist_iter(child[c], search, avail_spare, l2cache, NULL)) != NULL) { *l2cache = B_TRUE; return (ret); } } } return (NULL); } /* * Given a physical path or guid, find the associated vdev. */ nvlist_t * zpool_find_vdev_by_physpath(zpool_handle_t *zhp, const char *ppath, boolean_t *avail_spare, boolean_t *l2cache, boolean_t *log) { nvlist_t *search, *nvroot, *ret; uint64_t guid; char *end; search = fnvlist_alloc(); guid = strtoull(ppath, &end, 0); if (guid != 0 && *end == '\0') { fnvlist_add_uint64(search, ZPOOL_CONFIG_GUID, guid); } else { fnvlist_add_string(search, ZPOOL_CONFIG_PHYS_PATH, ppath); } nvroot = fnvlist_lookup_nvlist(zhp->zpool_config, ZPOOL_CONFIG_VDEV_TREE); *avail_spare = B_FALSE; *l2cache = B_FALSE; if (log != NULL) *log = B_FALSE; ret = vdev_to_nvlist_iter(nvroot, search, avail_spare, l2cache, log); fnvlist_free(search); return (ret); } /* * Determine if we have an "interior" top-level vdev (i.e mirror/raidz). */ static boolean_t zpool_vdev_is_interior(const char *name) { if (strncmp(name, VDEV_TYPE_RAIDZ, strlen(VDEV_TYPE_RAIDZ)) == 0 || strncmp(name, VDEV_TYPE_SPARE, strlen(VDEV_TYPE_SPARE)) == 0 || strncmp(name, VDEV_TYPE_REPLACING, strlen(VDEV_TYPE_REPLACING)) == 0 || strncmp(name, VDEV_TYPE_MIRROR, strlen(VDEV_TYPE_MIRROR)) == 0) return (B_TRUE); if (strncmp(name, VDEV_TYPE_DRAID, strlen(VDEV_TYPE_DRAID)) == 0 && !zpool_is_draid_spare(name)) return (B_TRUE); return (B_FALSE); } nvlist_t * zpool_find_vdev(zpool_handle_t *zhp, const char *path, boolean_t *avail_spare, boolean_t *l2cache, boolean_t *log) { char *end; nvlist_t *nvroot, *search, *ret; uint64_t guid; search = fnvlist_alloc(); guid = strtoull(path, &end, 0); if (guid != 0 && *end == '\0') { fnvlist_add_uint64(search, ZPOOL_CONFIG_GUID, guid); } else if (zpool_vdev_is_interior(path)) { fnvlist_add_string(search, ZPOOL_CONFIG_TYPE, path); } else { fnvlist_add_string(search, ZPOOL_CONFIG_PATH, path); } nvroot = fnvlist_lookup_nvlist(zhp->zpool_config, ZPOOL_CONFIG_VDEV_TREE); *avail_spare = B_FALSE; *l2cache = B_FALSE; if (log != NULL) *log = B_FALSE; ret = vdev_to_nvlist_iter(nvroot, search, avail_spare, l2cache, log); fnvlist_free(search); return (ret); } /* * Convert a vdev path to a GUID. Returns GUID or 0 on error. * * If is_spare, is_l2cache, or is_log is non-NULL, then store within it * if the VDEV is a spare, l2cache, or log device. If they're NULL then * ignore them. */ static uint64_t zpool_vdev_path_to_guid_impl(zpool_handle_t *zhp, const char *path, boolean_t *is_spare, boolean_t *is_l2cache, boolean_t *is_log) { boolean_t spare = B_FALSE, l2cache = B_FALSE, log = B_FALSE; nvlist_t *tgt; if ((tgt = zpool_find_vdev(zhp, path, &spare, &l2cache, &log)) == NULL) return (0); if (is_spare != NULL) *is_spare = spare; if (is_l2cache != NULL) *is_l2cache = l2cache; if (is_log != NULL) *is_log = log; return (fnvlist_lookup_uint64(tgt, ZPOOL_CONFIG_GUID)); } /* Convert a vdev path to a GUID. Returns GUID or 0 on error. */ uint64_t zpool_vdev_path_to_guid(zpool_handle_t *zhp, const char *path) { return (zpool_vdev_path_to_guid_impl(zhp, path, NULL, NULL, NULL)); } /* * Bring the specified vdev online. The 'flags' parameter is a set of the * ZFS_ONLINE_* flags. */ int zpool_vdev_online(zpool_handle_t *zhp, const char *path, int flags, vdev_state_t *newstate) { zfs_cmd_t zc = {"\0"}; char errbuf[ERRBUFLEN]; nvlist_t *tgt; boolean_t avail_spare, l2cache, islog; libzfs_handle_t *hdl = zhp->zpool_hdl; if (flags & ZFS_ONLINE_EXPAND) { (void) snprintf(errbuf, sizeof (errbuf), dgettext(TEXT_DOMAIN, "cannot expand %s"), path); } else { (void) snprintf(errbuf, sizeof (errbuf), dgettext(TEXT_DOMAIN, "cannot online %s"), path); } (void) strlcpy(zc.zc_name, zhp->zpool_name, sizeof (zc.zc_name)); if ((tgt = zpool_find_vdev(zhp, path, &avail_spare, &l2cache, &islog)) == NULL) return (zfs_error(hdl, EZFS_NODEVICE, errbuf)); zc.zc_guid = fnvlist_lookup_uint64(tgt, ZPOOL_CONFIG_GUID); if (avail_spare) return (zfs_error(hdl, EZFS_ISSPARE, errbuf)); #ifndef __FreeBSD__ char *pathname; if ((flags & ZFS_ONLINE_EXPAND || zpool_get_prop_int(zhp, ZPOOL_PROP_AUTOEXPAND, NULL)) && nvlist_lookup_string(tgt, ZPOOL_CONFIG_PATH, &pathname) == 0) { uint64_t wholedisk = 0; (void) nvlist_lookup_uint64(tgt, ZPOOL_CONFIG_WHOLE_DISK, &wholedisk); /* * XXX - L2ARC 1.0 devices can't support expansion. */ if (l2cache) { zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "cannot expand cache devices")); return (zfs_error(hdl, EZFS_VDEVNOTSUP, errbuf)); } if (wholedisk) { const char *fullpath = path; char buf[MAXPATHLEN]; int error; if (path[0] != '/') { error = zfs_resolve_shortname(path, buf, sizeof (buf)); if (error != 0) return (zfs_error(hdl, EZFS_NODEVICE, errbuf)); fullpath = buf; } error = zpool_relabel_disk(hdl, fullpath, errbuf); if (error != 0) return (error); } } #endif zc.zc_cookie = VDEV_STATE_ONLINE; zc.zc_obj = flags; if (zfs_ioctl(hdl, ZFS_IOC_VDEV_SET_STATE, &zc) != 0) { if (errno == EINVAL) { zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "was split " "from this pool into a new one. Use '%s' " "instead"), "zpool detach"); return (zfs_error(hdl, EZFS_POSTSPLIT_ONLINE, errbuf)); } return (zpool_standard_error(hdl, errno, errbuf)); } *newstate = zc.zc_cookie; return (0); } /* * Take the specified vdev offline */ int zpool_vdev_offline(zpool_handle_t *zhp, const char *path, boolean_t istmp) { zfs_cmd_t zc = {"\0"}; char errbuf[ERRBUFLEN]; nvlist_t *tgt; boolean_t avail_spare, l2cache; libzfs_handle_t *hdl = zhp->zpool_hdl; (void) snprintf(errbuf, sizeof (errbuf), dgettext(TEXT_DOMAIN, "cannot offline %s"), path); (void) strlcpy(zc.zc_name, zhp->zpool_name, sizeof (zc.zc_name)); if ((tgt = zpool_find_vdev(zhp, path, &avail_spare, &l2cache, NULL)) == NULL) return (zfs_error(hdl, EZFS_NODEVICE, errbuf)); zc.zc_guid = fnvlist_lookup_uint64(tgt, ZPOOL_CONFIG_GUID); if (avail_spare) return (zfs_error(hdl, EZFS_ISSPARE, errbuf)); zc.zc_cookie = VDEV_STATE_OFFLINE; zc.zc_obj = istmp ? ZFS_OFFLINE_TEMPORARY : 0; if (zfs_ioctl(hdl, ZFS_IOC_VDEV_SET_STATE, &zc) == 0) return (0); switch (errno) { case EBUSY: /* * There are no other replicas of this device. */ return (zfs_error(hdl, EZFS_NOREPLICAS, errbuf)); case EEXIST: /* * The log device has unplayed logs */ return (zfs_error(hdl, EZFS_UNPLAYED_LOGS, errbuf)); default: return (zpool_standard_error(hdl, errno, errbuf)); } } /* * Remove the specified vdev asynchronously from the configuration, so * that it may come ONLINE if reinserted. This is called from zed on * Udev remove event. * Note: We also have a similar function zpool_vdev_remove() that * removes the vdev from the pool. */ int zpool_vdev_remove_wanted(zpool_handle_t *zhp, const char *path) { zfs_cmd_t zc = {"\0"}; char errbuf[ERRBUFLEN]; nvlist_t *tgt; boolean_t avail_spare, l2cache; libzfs_handle_t *hdl = zhp->zpool_hdl; (void) snprintf(errbuf, sizeof (errbuf), dgettext(TEXT_DOMAIN, "cannot remove %s"), path); (void) strlcpy(zc.zc_name, zhp->zpool_name, sizeof (zc.zc_name)); if ((tgt = zpool_find_vdev(zhp, path, &avail_spare, &l2cache, NULL)) == NULL) return (zfs_error(hdl, EZFS_NODEVICE, errbuf)); zc.zc_guid = fnvlist_lookup_uint64(tgt, ZPOOL_CONFIG_GUID); if (avail_spare) return (zfs_error(hdl, EZFS_ISSPARE, errbuf)); zc.zc_cookie = VDEV_STATE_REMOVED; if (zfs_ioctl(hdl, ZFS_IOC_VDEV_SET_STATE, &zc) == 0) return (0); return (zpool_standard_error(hdl, errno, errbuf)); } /* * Mark the given vdev faulted. */ int zpool_vdev_fault(zpool_handle_t *zhp, uint64_t guid, vdev_aux_t aux) { zfs_cmd_t zc = {"\0"}; char errbuf[ERRBUFLEN]; libzfs_handle_t *hdl = zhp->zpool_hdl; (void) snprintf(errbuf, sizeof (errbuf), dgettext(TEXT_DOMAIN, "cannot fault %llu"), (u_longlong_t)guid); (void) strlcpy(zc.zc_name, zhp->zpool_name, sizeof (zc.zc_name)); zc.zc_guid = guid; zc.zc_cookie = VDEV_STATE_FAULTED; zc.zc_obj = aux; if (zfs_ioctl(hdl, ZFS_IOC_VDEV_SET_STATE, &zc) == 0) return (0); switch (errno) { case EBUSY: /* * There are no other replicas of this device. */ return (zfs_error(hdl, EZFS_NOREPLICAS, errbuf)); default: return (zpool_standard_error(hdl, errno, errbuf)); } } /* * Mark the given vdev degraded. */ int zpool_vdev_degrade(zpool_handle_t *zhp, uint64_t guid, vdev_aux_t aux) { zfs_cmd_t zc = {"\0"}; char errbuf[ERRBUFLEN]; libzfs_handle_t *hdl = zhp->zpool_hdl; (void) snprintf(errbuf, sizeof (errbuf), dgettext(TEXT_DOMAIN, "cannot degrade %llu"), (u_longlong_t)guid); (void) strlcpy(zc.zc_name, zhp->zpool_name, sizeof (zc.zc_name)); zc.zc_guid = guid; zc.zc_cookie = VDEV_STATE_DEGRADED; zc.zc_obj = aux; if (zfs_ioctl(hdl, ZFS_IOC_VDEV_SET_STATE, &zc) == 0) return (0); return (zpool_standard_error(hdl, errno, errbuf)); } /* * Returns TRUE if the given nvlist is a vdev that was originally swapped in as * a hot spare. */ static boolean_t is_replacing_spare(nvlist_t *search, nvlist_t *tgt, int which) { nvlist_t **child; uint_t c, children; if (nvlist_lookup_nvlist_array(search, ZPOOL_CONFIG_CHILDREN, &child, &children) == 0) { char *type = fnvlist_lookup_string(search, ZPOOL_CONFIG_TYPE); if ((strcmp(type, VDEV_TYPE_SPARE) == 0 || strcmp(type, VDEV_TYPE_DRAID_SPARE) == 0) && children == 2 && child[which] == tgt) return (B_TRUE); for (c = 0; c < children; c++) if (is_replacing_spare(child[c], tgt, which)) return (B_TRUE); } return (B_FALSE); } /* * Attach new_disk (fully described by nvroot) to old_disk. * If 'replacing' is specified, the new disk will replace the old one. */ int zpool_vdev_attach(zpool_handle_t *zhp, const char *old_disk, const char *new_disk, nvlist_t *nvroot, int replacing, boolean_t rebuild) { zfs_cmd_t zc = {"\0"}; char errbuf[ERRBUFLEN]; int ret; nvlist_t *tgt; boolean_t avail_spare, l2cache, islog; uint64_t val; char *newname; nvlist_t **child; uint_t children; nvlist_t *config_root; libzfs_handle_t *hdl = zhp->zpool_hdl; if (replacing) (void) snprintf(errbuf, sizeof (errbuf), dgettext(TEXT_DOMAIN, "cannot replace %s with %s"), old_disk, new_disk); else (void) snprintf(errbuf, sizeof (errbuf), dgettext(TEXT_DOMAIN, "cannot attach %s to %s"), new_disk, old_disk); (void) strlcpy(zc.zc_name, zhp->zpool_name, sizeof (zc.zc_name)); if ((tgt = zpool_find_vdev(zhp, old_disk, &avail_spare, &l2cache, &islog)) == NULL) return (zfs_error(hdl, EZFS_NODEVICE, errbuf)); if (avail_spare) return (zfs_error(hdl, EZFS_ISSPARE, errbuf)); if (l2cache) return (zfs_error(hdl, EZFS_ISL2CACHE, errbuf)); zc.zc_guid = fnvlist_lookup_uint64(tgt, ZPOOL_CONFIG_GUID); zc.zc_cookie = replacing; zc.zc_simple = rebuild; if (rebuild && zfeature_lookup_guid("org.openzfs:device_rebuild", NULL) != 0) { zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "the loaded zfs module doesn't support device rebuilds")); return (zfs_error(hdl, EZFS_POOL_NOTSUP, errbuf)); } if (nvlist_lookup_nvlist_array(nvroot, ZPOOL_CONFIG_CHILDREN, &child, &children) != 0 || children != 1) { zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "new device must be a single disk")); return (zfs_error(hdl, EZFS_INVALCONFIG, errbuf)); } config_root = fnvlist_lookup_nvlist(zpool_get_config(zhp, NULL), ZPOOL_CONFIG_VDEV_TREE); if ((newname = zpool_vdev_name(NULL, NULL, child[0], 0)) == NULL) return (-1); /* * If the target is a hot spare that has been swapped in, we can only * replace it with another hot spare. */ if (replacing && nvlist_lookup_uint64(tgt, ZPOOL_CONFIG_IS_SPARE, &val) == 0 && (zpool_find_vdev(zhp, newname, &avail_spare, &l2cache, NULL) == NULL || !avail_spare) && is_replacing_spare(config_root, tgt, 1)) { zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "can only be replaced by another hot spare")); free(newname); return (zfs_error(hdl, EZFS_BADTARGET, errbuf)); } free(newname); zcmd_write_conf_nvlist(hdl, &zc, nvroot); ret = zfs_ioctl(hdl, ZFS_IOC_VDEV_ATTACH, &zc); zcmd_free_nvlists(&zc); if (ret == 0) return (0); switch (errno) { case ENOTSUP: /* * Can't attach to or replace this type of vdev. */ if (replacing) { uint64_t version = zpool_get_prop_int(zhp, ZPOOL_PROP_VERSION, NULL); if (islog) { zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "cannot replace a log with a spare")); } else if (rebuild) { zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "only mirror and dRAID vdevs support " "sequential reconstruction")); } else if (zpool_is_draid_spare(new_disk)) { zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "dRAID spares can only replace child " "devices in their parent's dRAID vdev")); } else if (version >= SPA_VERSION_MULTI_REPLACE) { zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "already in replacing/spare config; wait " "for completion or use 'zpool detach'")); } else { zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "cannot replace a replacing device")); } } else { char status[64] = {0}; zpool_prop_get_feature(zhp, "feature@device_rebuild", status, 63); if (rebuild && strncmp(status, ZFS_FEATURE_DISABLED, 64) == 0) { zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "device_rebuild feature must be enabled " "in order to use sequential " "reconstruction")); } else { zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "can only attach to mirrors and top-level " "disks")); } } (void) zfs_error(hdl, EZFS_BADTARGET, errbuf); break; case EINVAL: /* * The new device must be a single disk. */ zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "new device must be a single disk")); (void) zfs_error(hdl, EZFS_INVALCONFIG, errbuf); break; case EBUSY: zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "%s is busy, " "or device removal is in progress"), new_disk); (void) zfs_error(hdl, EZFS_BADDEV, errbuf); break; case EOVERFLOW: /* * The new device is too small. */ zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "device is too small")); (void) zfs_error(hdl, EZFS_BADDEV, errbuf); break; case EDOM: /* * The new device has a different optimal sector size. */ zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "new device has a different optimal sector size; use the " "option '-o ashift=N' to override the optimal size")); (void) zfs_error(hdl, EZFS_BADDEV, errbuf); break; case ENAMETOOLONG: /* * The resulting top-level vdev spec won't fit in the label. */ (void) zfs_error(hdl, EZFS_DEVOVERFLOW, errbuf); break; default: (void) zpool_standard_error(hdl, errno, errbuf); } return (-1); } /* * Detach the specified device. */ int zpool_vdev_detach(zpool_handle_t *zhp, const char *path) { zfs_cmd_t zc = {"\0"}; char errbuf[ERRBUFLEN]; nvlist_t *tgt; boolean_t avail_spare, l2cache; libzfs_handle_t *hdl = zhp->zpool_hdl; (void) snprintf(errbuf, sizeof (errbuf), dgettext(TEXT_DOMAIN, "cannot detach %s"), path); (void) strlcpy(zc.zc_name, zhp->zpool_name, sizeof (zc.zc_name)); if ((tgt = zpool_find_vdev(zhp, path, &avail_spare, &l2cache, NULL)) == NULL) return (zfs_error(hdl, EZFS_NODEVICE, errbuf)); if (avail_spare) return (zfs_error(hdl, EZFS_ISSPARE, errbuf)); if (l2cache) return (zfs_error(hdl, EZFS_ISL2CACHE, errbuf)); zc.zc_guid = fnvlist_lookup_uint64(tgt, ZPOOL_CONFIG_GUID); if (zfs_ioctl(hdl, ZFS_IOC_VDEV_DETACH, &zc) == 0) return (0); switch (errno) { case ENOTSUP: /* * Can't detach from this type of vdev. */ zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "only " "applicable to mirror and replacing vdevs")); (void) zfs_error(hdl, EZFS_BADTARGET, errbuf); break; case EBUSY: /* * There are no other replicas of this device. */ (void) zfs_error(hdl, EZFS_NOREPLICAS, errbuf); break; default: (void) zpool_standard_error(hdl, errno, errbuf); } return (-1); } /* * Find a mirror vdev in the source nvlist. * * The mchild array contains a list of disks in one of the top-level mirrors * of the source pool. The schild array contains a list of disks that the * user specified on the command line. We loop over the mchild array to * see if any entry in the schild array matches. * * If a disk in the mchild array is found in the schild array, we return * the index of that entry. Otherwise we return -1. */ static int find_vdev_entry(zpool_handle_t *zhp, nvlist_t **mchild, uint_t mchildren, nvlist_t **schild, uint_t schildren) { uint_t mc; for (mc = 0; mc < mchildren; mc++) { uint_t sc; char *mpath = zpool_vdev_name(zhp->zpool_hdl, zhp, mchild[mc], 0); for (sc = 0; sc < schildren; sc++) { char *spath = zpool_vdev_name(zhp->zpool_hdl, zhp, schild[sc], 0); boolean_t result = (strcmp(mpath, spath) == 0); free(spath); if (result) { free(mpath); return (mc); } } free(mpath); } return (-1); } /* * Split a mirror pool. If newroot points to null, then a new nvlist * is generated and it is the responsibility of the caller to free it. */ int zpool_vdev_split(zpool_handle_t *zhp, char *newname, nvlist_t **newroot, nvlist_t *props, splitflags_t flags) { zfs_cmd_t zc = {"\0"}; char errbuf[ERRBUFLEN], *bias; nvlist_t *tree, *config, **child, **newchild, *newconfig = NULL; nvlist_t **varray = NULL, *zc_props = NULL; uint_t c, children, newchildren, lastlog = 0, vcount, found = 0; libzfs_handle_t *hdl = zhp->zpool_hdl; uint64_t vers, readonly = B_FALSE; boolean_t freelist = B_FALSE, memory_err = B_TRUE; int retval = 0; (void) snprintf(errbuf, sizeof (errbuf), dgettext(TEXT_DOMAIN, "Unable to split %s"), zhp->zpool_name); if (!zpool_name_valid(hdl, B_FALSE, newname)) return (zfs_error(hdl, EZFS_INVALIDNAME, errbuf)); if ((config = zpool_get_config(zhp, NULL)) == NULL) { (void) fprintf(stderr, gettext("Internal error: unable to " "retrieve pool configuration\n")); return (-1); } tree = fnvlist_lookup_nvlist(config, ZPOOL_CONFIG_VDEV_TREE); vers = fnvlist_lookup_uint64(config, ZPOOL_CONFIG_VERSION); if (props) { prop_flags_t flags = { .create = B_FALSE, .import = B_TRUE }; if ((zc_props = zpool_valid_proplist(hdl, zhp->zpool_name, props, vers, flags, errbuf)) == NULL) return (-1); (void) nvlist_lookup_uint64(zc_props, zpool_prop_to_name(ZPOOL_PROP_READONLY), &readonly); if (readonly) { zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "property %s can only be set at import time"), zpool_prop_to_name(ZPOOL_PROP_READONLY)); return (-1); } } if (nvlist_lookup_nvlist_array(tree, ZPOOL_CONFIG_CHILDREN, &child, &children) != 0) { zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "Source pool is missing vdev tree")); nvlist_free(zc_props); return (-1); } varray = zfs_alloc(hdl, children * sizeof (nvlist_t *)); vcount = 0; if (*newroot == NULL || nvlist_lookup_nvlist_array(*newroot, ZPOOL_CONFIG_CHILDREN, &newchild, &newchildren) != 0) newchildren = 0; for (c = 0; c < children; c++) { uint64_t is_log = B_FALSE, is_hole = B_FALSE; boolean_t is_special = B_FALSE, is_dedup = B_FALSE; char *type; nvlist_t **mchild, *vdev; uint_t mchildren; int entry; /* * Unlike cache & spares, slogs are stored in the * ZPOOL_CONFIG_CHILDREN array. We filter them out here. */ (void) nvlist_lookup_uint64(child[c], ZPOOL_CONFIG_IS_LOG, &is_log); (void) nvlist_lookup_uint64(child[c], ZPOOL_CONFIG_IS_HOLE, &is_hole); if (is_log || is_hole) { /* * Create a hole vdev and put it in the config. */ if (nvlist_alloc(&vdev, NV_UNIQUE_NAME, 0) != 0) goto out; if (nvlist_add_string(vdev, ZPOOL_CONFIG_TYPE, VDEV_TYPE_HOLE) != 0) goto out; if (nvlist_add_uint64(vdev, ZPOOL_CONFIG_IS_HOLE, 1) != 0) goto out; if (lastlog == 0) lastlog = vcount; varray[vcount++] = vdev; continue; } lastlog = 0; type = fnvlist_lookup_string(child[c], ZPOOL_CONFIG_TYPE); if (strcmp(type, VDEV_TYPE_INDIRECT) == 0) { vdev = child[c]; if (nvlist_dup(vdev, &varray[vcount++], 0) != 0) goto out; continue; } else if (strcmp(type, VDEV_TYPE_MIRROR) != 0) { zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "Source pool must be composed only of mirrors\n")); retval = zfs_error(hdl, EZFS_INVALCONFIG, errbuf); goto out; } if (nvlist_lookup_string(child[c], ZPOOL_CONFIG_ALLOCATION_BIAS, &bias) == 0) { if (strcmp(bias, VDEV_ALLOC_BIAS_SPECIAL) == 0) is_special = B_TRUE; else if (strcmp(bias, VDEV_ALLOC_BIAS_DEDUP) == 0) is_dedup = B_TRUE; } verify(nvlist_lookup_nvlist_array(child[c], ZPOOL_CONFIG_CHILDREN, &mchild, &mchildren) == 0); /* find or add an entry for this top-level vdev */ if (newchildren > 0 && (entry = find_vdev_entry(zhp, mchild, mchildren, newchild, newchildren)) >= 0) { /* We found a disk that the user specified. */ vdev = mchild[entry]; ++found; } else { /* User didn't specify a disk for this vdev. */ vdev = mchild[mchildren - 1]; } if (nvlist_dup(vdev, &varray[vcount++], 0) != 0) goto out; if (flags.dryrun != 0) { if (is_dedup == B_TRUE) { if (nvlist_add_string(varray[vcount - 1], ZPOOL_CONFIG_ALLOCATION_BIAS, VDEV_ALLOC_BIAS_DEDUP) != 0) goto out; } else if (is_special == B_TRUE) { if (nvlist_add_string(varray[vcount - 1], ZPOOL_CONFIG_ALLOCATION_BIAS, VDEV_ALLOC_BIAS_SPECIAL) != 0) goto out; } } } /* did we find every disk the user specified? */ if (found != newchildren) { zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "Device list must " "include at most one disk from each mirror")); retval = zfs_error(hdl, EZFS_INVALCONFIG, errbuf); goto out; } /* Prepare the nvlist for populating. */ if (*newroot == NULL) { if (nvlist_alloc(newroot, NV_UNIQUE_NAME, 0) != 0) goto out; freelist = B_TRUE; if (nvlist_add_string(*newroot, ZPOOL_CONFIG_TYPE, VDEV_TYPE_ROOT) != 0) goto out; } else { verify(nvlist_remove_all(*newroot, ZPOOL_CONFIG_CHILDREN) == 0); } /* Add all the children we found */ if (nvlist_add_nvlist_array(*newroot, ZPOOL_CONFIG_CHILDREN, (const nvlist_t **)varray, lastlog == 0 ? vcount : lastlog) != 0) goto out; /* * If we're just doing a dry run, exit now with success. */ if (flags.dryrun) { memory_err = B_FALSE; freelist = B_FALSE; goto out; } /* now build up the config list & call the ioctl */ if (nvlist_alloc(&newconfig, NV_UNIQUE_NAME, 0) != 0) goto out; if (nvlist_add_nvlist(newconfig, ZPOOL_CONFIG_VDEV_TREE, *newroot) != 0 || nvlist_add_string(newconfig, ZPOOL_CONFIG_POOL_NAME, newname) != 0 || nvlist_add_uint64(newconfig, ZPOOL_CONFIG_VERSION, vers) != 0) goto out; /* * The new pool is automatically part of the namespace unless we * explicitly export it. */ if (!flags.import) zc.zc_cookie = ZPOOL_EXPORT_AFTER_SPLIT; (void) strlcpy(zc.zc_name, zhp->zpool_name, sizeof (zc.zc_name)); (void) strlcpy(zc.zc_string, newname, sizeof (zc.zc_string)); zcmd_write_conf_nvlist(hdl, &zc, newconfig); if (zc_props != NULL) zcmd_write_src_nvlist(hdl, &zc, zc_props); if (zfs_ioctl(hdl, ZFS_IOC_VDEV_SPLIT, &zc) != 0) { retval = zpool_standard_error(hdl, errno, errbuf); goto out; } freelist = B_FALSE; memory_err = B_FALSE; out: if (varray != NULL) { int v; for (v = 0; v < vcount; v++) nvlist_free(varray[v]); free(varray); } zcmd_free_nvlists(&zc); nvlist_free(zc_props); nvlist_free(newconfig); if (freelist) { nvlist_free(*newroot); *newroot = NULL; } if (retval != 0) return (retval); if (memory_err) return (no_memory(hdl)); return (0); } /* * Remove the given device. */ int zpool_vdev_remove(zpool_handle_t *zhp, const char *path) { zfs_cmd_t zc = {"\0"}; char errbuf[ERRBUFLEN]; nvlist_t *tgt; boolean_t avail_spare, l2cache, islog; libzfs_handle_t *hdl = zhp->zpool_hdl; uint64_t version; (void) snprintf(errbuf, sizeof (errbuf), dgettext(TEXT_DOMAIN, "cannot remove %s"), path); if (zpool_is_draid_spare(path)) { zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "dRAID spares cannot be removed")); return (zfs_error(hdl, EZFS_NODEVICE, errbuf)); } (void) strlcpy(zc.zc_name, zhp->zpool_name, sizeof (zc.zc_name)); if ((tgt = zpool_find_vdev(zhp, path, &avail_spare, &l2cache, &islog)) == NULL) return (zfs_error(hdl, EZFS_NODEVICE, errbuf)); version = zpool_get_prop_int(zhp, ZPOOL_PROP_VERSION, NULL); if (islog && version < SPA_VERSION_HOLES) { zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "pool must be upgraded to support log removal")); return (zfs_error(hdl, EZFS_BADVERSION, errbuf)); } zc.zc_guid = fnvlist_lookup_uint64(tgt, ZPOOL_CONFIG_GUID); if (zfs_ioctl(hdl, ZFS_IOC_VDEV_REMOVE, &zc) == 0) return (0); switch (errno) { case EINVAL: zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "invalid config; all top-level vdevs must " "have the same sector size and not be raidz.")); (void) zfs_error(hdl, EZFS_INVALCONFIG, errbuf); break; case EBUSY: if (islog) { zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "Mount encrypted datasets to replay logs.")); } else { zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "Pool busy; removal may already be in progress")); } (void) zfs_error(hdl, EZFS_BUSY, errbuf); break; case EACCES: if (islog) { zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "Mount encrypted datasets to replay logs.")); (void) zfs_error(hdl, EZFS_BUSY, errbuf); } else { (void) zpool_standard_error(hdl, errno, errbuf); } break; default: (void) zpool_standard_error(hdl, errno, errbuf); } return (-1); } int zpool_vdev_remove_cancel(zpool_handle_t *zhp) { zfs_cmd_t zc = {{0}}; char errbuf[ERRBUFLEN]; libzfs_handle_t *hdl = zhp->zpool_hdl; (void) snprintf(errbuf, sizeof (errbuf), dgettext(TEXT_DOMAIN, "cannot cancel removal")); (void) strlcpy(zc.zc_name, zhp->zpool_name, sizeof (zc.zc_name)); zc.zc_cookie = 1; if (zfs_ioctl(hdl, ZFS_IOC_VDEV_REMOVE, &zc) == 0) return (0); return (zpool_standard_error(hdl, errno, errbuf)); } int zpool_vdev_indirect_size(zpool_handle_t *zhp, const char *path, uint64_t *sizep) { char errbuf[ERRBUFLEN]; nvlist_t *tgt; boolean_t avail_spare, l2cache, islog; libzfs_handle_t *hdl = zhp->zpool_hdl; (void) snprintf(errbuf, sizeof (errbuf), dgettext(TEXT_DOMAIN, "cannot determine indirect size of %s"), path); if ((tgt = zpool_find_vdev(zhp, path, &avail_spare, &l2cache, &islog)) == NULL) return (zfs_error(hdl, EZFS_NODEVICE, errbuf)); if (avail_spare || l2cache || islog) { *sizep = 0; return (0); } if (nvlist_lookup_uint64(tgt, ZPOOL_CONFIG_INDIRECT_SIZE, sizep) != 0) { zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "indirect size not available")); return (zfs_error(hdl, EINVAL, errbuf)); } return (0); } /* * Clear the errors for the pool, or the particular device if specified. */ int zpool_clear(zpool_handle_t *zhp, const char *path, nvlist_t *rewindnvl) { zfs_cmd_t zc = {"\0"}; char errbuf[ERRBUFLEN]; nvlist_t *tgt; zpool_load_policy_t policy; boolean_t avail_spare, l2cache; libzfs_handle_t *hdl = zhp->zpool_hdl; nvlist_t *nvi = NULL; int error; if (path) (void) snprintf(errbuf, sizeof (errbuf), dgettext(TEXT_DOMAIN, "cannot clear errors for %s"), path); else (void) snprintf(errbuf, sizeof (errbuf), dgettext(TEXT_DOMAIN, "cannot clear errors for %s"), zhp->zpool_name); (void) strlcpy(zc.zc_name, zhp->zpool_name, sizeof (zc.zc_name)); if (path) { if ((tgt = zpool_find_vdev(zhp, path, &avail_spare, &l2cache, NULL)) == NULL) return (zfs_error(hdl, EZFS_NODEVICE, errbuf)); /* * Don't allow error clearing for hot spares. Do allow * error clearing for l2cache devices. */ if (avail_spare) return (zfs_error(hdl, EZFS_ISSPARE, errbuf)); zc.zc_guid = fnvlist_lookup_uint64(tgt, ZPOOL_CONFIG_GUID); } zpool_get_load_policy(rewindnvl, &policy); zc.zc_cookie = policy.zlp_rewind; zcmd_alloc_dst_nvlist(hdl, &zc, zhp->zpool_config_size * 2); zcmd_write_src_nvlist(hdl, &zc, rewindnvl); while ((error = zfs_ioctl(hdl, ZFS_IOC_CLEAR, &zc)) != 0 && errno == ENOMEM) zcmd_expand_dst_nvlist(hdl, &zc); if (!error || ((policy.zlp_rewind & ZPOOL_TRY_REWIND) && errno != EPERM && errno != EACCES)) { if (policy.zlp_rewind & (ZPOOL_DO_REWIND | ZPOOL_TRY_REWIND)) { (void) zcmd_read_dst_nvlist(hdl, &zc, &nvi); zpool_rewind_exclaim(hdl, zc.zc_name, ((policy.zlp_rewind & ZPOOL_TRY_REWIND) != 0), nvi); nvlist_free(nvi); } zcmd_free_nvlists(&zc); return (0); } zcmd_free_nvlists(&zc); return (zpool_standard_error(hdl, errno, errbuf)); } /* * Similar to zpool_clear(), but takes a GUID (used by fmd). */ int zpool_vdev_clear(zpool_handle_t *zhp, uint64_t guid) { zfs_cmd_t zc = {"\0"}; char errbuf[ERRBUFLEN]; libzfs_handle_t *hdl = zhp->zpool_hdl; (void) snprintf(errbuf, sizeof (errbuf), dgettext(TEXT_DOMAIN, "cannot clear errors for %llx"), (u_longlong_t)guid); (void) strlcpy(zc.zc_name, zhp->zpool_name, sizeof (zc.zc_name)); zc.zc_guid = guid; zc.zc_cookie = ZPOOL_NO_REWIND; if (zfs_ioctl(hdl, ZFS_IOC_CLEAR, &zc) == 0) return (0); return (zpool_standard_error(hdl, errno, errbuf)); } /* * Change the GUID for a pool. */ int zpool_reguid(zpool_handle_t *zhp) { char errbuf[ERRBUFLEN]; libzfs_handle_t *hdl = zhp->zpool_hdl; zfs_cmd_t zc = {"\0"}; (void) snprintf(errbuf, sizeof (errbuf), dgettext(TEXT_DOMAIN, "cannot reguid '%s'"), zhp->zpool_name); (void) strlcpy(zc.zc_name, zhp->zpool_name, sizeof (zc.zc_name)); if (zfs_ioctl(hdl, ZFS_IOC_POOL_REGUID, &zc) == 0) return (0); return (zpool_standard_error(hdl, errno, errbuf)); } /* * Reopen the pool. */ int zpool_reopen_one(zpool_handle_t *zhp, void *data) { libzfs_handle_t *hdl = zpool_get_handle(zhp); const char *pool_name = zpool_get_name(zhp); boolean_t *scrub_restart = data; int error; error = lzc_reopen(pool_name, *scrub_restart); if (error) { return (zpool_standard_error_fmt(hdl, error, dgettext(TEXT_DOMAIN, "cannot reopen '%s'"), pool_name)); } return (0); } /* call into libzfs_core to execute the sync IOCTL per pool */ int zpool_sync_one(zpool_handle_t *zhp, void *data) { int ret; libzfs_handle_t *hdl = zpool_get_handle(zhp); const char *pool_name = zpool_get_name(zhp); boolean_t *force = data; nvlist_t *innvl = fnvlist_alloc(); fnvlist_add_boolean_value(innvl, "force", *force); if ((ret = lzc_sync(pool_name, innvl, NULL)) != 0) { nvlist_free(innvl); return (zpool_standard_error_fmt(hdl, ret, dgettext(TEXT_DOMAIN, "sync '%s' failed"), pool_name)); } nvlist_free(innvl); return (0); } #define PATH_BUF_LEN 64 /* * Given a vdev, return the name to display in iostat. If the vdev has a path, * we use that, stripping off any leading "/dev/dsk/"; if not, we use the type. * We also check if this is a whole disk, in which case we strip off the * trailing 's0' slice name. * * This routine is also responsible for identifying when disks have been * reconfigured in a new location. The kernel will have opened the device by * devid, but the path will still refer to the old location. To catch this, we * first do a path -> devid translation (which is fast for the common case). If * the devid matches, we're done. If not, we do a reverse devid -> path * translation and issue the appropriate ioctl() to update the path of the vdev. * If 'zhp' is NULL, then this is an exported pool, and we don't need to do any * of these checks. */ char * zpool_vdev_name(libzfs_handle_t *hdl, zpool_handle_t *zhp, nvlist_t *nv, int name_flags) { char *type, *tpath; const char *path; uint64_t value; char buf[PATH_BUF_LEN]; char tmpbuf[PATH_BUF_LEN * 2]; /* * vdev_name will be "root"/"root-0" for the root vdev, but it is the * zpool name that will be displayed to the user. */ type = fnvlist_lookup_string(nv, ZPOOL_CONFIG_TYPE); if (zhp != NULL && strcmp(type, "root") == 0) return (zfs_strdup(hdl, zpool_get_name(zhp))); if (libzfs_envvar_is_set("ZPOOL_VDEV_NAME_PATH")) name_flags |= VDEV_NAME_PATH; if (libzfs_envvar_is_set("ZPOOL_VDEV_NAME_GUID")) name_flags |= VDEV_NAME_GUID; if (libzfs_envvar_is_set("ZPOOL_VDEV_NAME_FOLLOW_LINKS")) name_flags |= VDEV_NAME_FOLLOW_LINKS; if (nvlist_lookup_uint64(nv, ZPOOL_CONFIG_NOT_PRESENT, &value) == 0 || name_flags & VDEV_NAME_GUID) { (void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_GUID, &value); (void) snprintf(buf, sizeof (buf), "%llu", (u_longlong_t)value); path = buf; } else if (nvlist_lookup_string(nv, ZPOOL_CONFIG_PATH, &tpath) == 0) { path = tpath; if (name_flags & VDEV_NAME_FOLLOW_LINKS) { char *rp = realpath(path, NULL); if (rp) { strlcpy(buf, rp, sizeof (buf)); path = buf; free(rp); } } /* * For a block device only use the name. */ if ((strcmp(type, VDEV_TYPE_DISK) == 0) && !(name_flags & VDEV_NAME_PATH)) { path = zfs_strip_path(path); } /* * Remove the partition from the path if this is a whole disk. */ if (strcmp(type, VDEV_TYPE_DRAID_SPARE) != 0 && nvlist_lookup_uint64(nv, ZPOOL_CONFIG_WHOLE_DISK, &value) == 0 && value && !(name_flags & VDEV_NAME_PATH)) { return (zfs_strip_partition(path)); } } else { path = type; /* * If it's a raidz device, we need to stick in the parity level. */ if (strcmp(path, VDEV_TYPE_RAIDZ) == 0) { value = fnvlist_lookup_uint64(nv, ZPOOL_CONFIG_NPARITY); (void) snprintf(buf, sizeof (buf), "%s%llu", path, (u_longlong_t)value); path = buf; } /* * If it's a dRAID device, we add parity, groups, and spares. */ if (strcmp(path, VDEV_TYPE_DRAID) == 0) { uint64_t ndata, nparity, nspares; nvlist_t **child; uint_t children; verify(nvlist_lookup_nvlist_array(nv, ZPOOL_CONFIG_CHILDREN, &child, &children) == 0); nparity = fnvlist_lookup_uint64(nv, ZPOOL_CONFIG_NPARITY); ndata = fnvlist_lookup_uint64(nv, ZPOOL_CONFIG_DRAID_NDATA); nspares = fnvlist_lookup_uint64(nv, ZPOOL_CONFIG_DRAID_NSPARES); path = zpool_draid_name(buf, sizeof (buf), ndata, nparity, nspares, children); } /* * We identify each top-level vdev by using a * naming convention. */ if (name_flags & VDEV_NAME_TYPE_ID) { uint64_t id = fnvlist_lookup_uint64(nv, ZPOOL_CONFIG_ID); (void) snprintf(tmpbuf, sizeof (tmpbuf), "%s-%llu", path, (u_longlong_t)id); path = tmpbuf; } } return (zfs_strdup(hdl, path)); } static int zbookmark_mem_compare(const void *a, const void *b) { return (memcmp(a, b, sizeof (zbookmark_phys_t))); } /* * Retrieve the persistent error log, uniquify the members, and return to the * caller. */ int zpool_get_errlog(zpool_handle_t *zhp, nvlist_t **nverrlistp) { zfs_cmd_t zc = {"\0"}; libzfs_handle_t *hdl = zhp->zpool_hdl; uint64_t count; zbookmark_phys_t *zb = NULL; int i; /* * Retrieve the raw error list from the kernel. If the number of errors * has increased, allocate more space and continue until we get the * entire list. */ count = fnvlist_lookup_uint64(zhp->zpool_config, ZPOOL_CONFIG_ERRCOUNT); if (count == 0) return (0); zc.zc_nvlist_dst = (uintptr_t)zfs_alloc(zhp->zpool_hdl, count * sizeof (zbookmark_phys_t)); zc.zc_nvlist_dst_size = count; (void) strcpy(zc.zc_name, zhp->zpool_name); for (;;) { if (zfs_ioctl(zhp->zpool_hdl, ZFS_IOC_ERROR_LOG, &zc) != 0) { free((void *)(uintptr_t)zc.zc_nvlist_dst); if (errno == ENOMEM) { void *dst; count = zc.zc_nvlist_dst_size; dst = zfs_alloc(zhp->zpool_hdl, count * sizeof (zbookmark_phys_t)); zc.zc_nvlist_dst = (uintptr_t)dst; } else { return (zpool_standard_error_fmt(hdl, errno, dgettext(TEXT_DOMAIN, "errors: List of " "errors unavailable"))); } } else { break; } } /* * Sort the resulting bookmarks. This is a little confusing due to the * implementation of ZFS_IOC_ERROR_LOG. The bookmarks are copied last * to first, and 'zc_nvlist_dst_size' indicates the number of bookmarks * _not_ copied as part of the process. So we point the start of our * array appropriate and decrement the total number of elements. */ zb = ((zbookmark_phys_t *)(uintptr_t)zc.zc_nvlist_dst) + zc.zc_nvlist_dst_size; count -= zc.zc_nvlist_dst_size; qsort(zb, count, sizeof (zbookmark_phys_t), zbookmark_mem_compare); verify(nvlist_alloc(nverrlistp, 0, KM_SLEEP) == 0); /* * Fill in the nverrlistp with nvlist's of dataset and object numbers. */ for (i = 0; i < count; i++) { nvlist_t *nv; /* ignoring zb_blkid and zb_level for now */ if (i > 0 && zb[i-1].zb_objset == zb[i].zb_objset && zb[i-1].zb_object == zb[i].zb_object) continue; if (nvlist_alloc(&nv, NV_UNIQUE_NAME, KM_SLEEP) != 0) goto nomem; if (nvlist_add_uint64(nv, ZPOOL_ERR_DATASET, zb[i].zb_objset) != 0) { nvlist_free(nv); goto nomem; } if (nvlist_add_uint64(nv, ZPOOL_ERR_OBJECT, zb[i].zb_object) != 0) { nvlist_free(nv); goto nomem; } if (nvlist_add_nvlist(*nverrlistp, "ejk", nv) != 0) { nvlist_free(nv); goto nomem; } nvlist_free(nv); } free((void *)(uintptr_t)zc.zc_nvlist_dst); return (0); nomem: free((void *)(uintptr_t)zc.zc_nvlist_dst); return (no_memory(zhp->zpool_hdl)); } /* * Upgrade a ZFS pool to the latest on-disk version. */ int zpool_upgrade(zpool_handle_t *zhp, uint64_t new_version) { zfs_cmd_t zc = {"\0"}; libzfs_handle_t *hdl = zhp->zpool_hdl; (void) strcpy(zc.zc_name, zhp->zpool_name); zc.zc_cookie = new_version; if (zfs_ioctl(hdl, ZFS_IOC_POOL_UPGRADE, &zc) != 0) return (zpool_standard_error_fmt(hdl, errno, dgettext(TEXT_DOMAIN, "cannot upgrade '%s'"), zhp->zpool_name)); return (0); } void zfs_save_arguments(int argc, char **argv, char *string, int len) { int i; (void) strlcpy(string, zfs_basename(argv[0]), len); for (i = 1; i < argc; i++) { (void) strlcat(string, " ", len); (void) strlcat(string, argv[i], len); } } int zpool_log_history(libzfs_handle_t *hdl, const char *message) { zfs_cmd_t zc = {"\0"}; nvlist_t *args; args = fnvlist_alloc(); fnvlist_add_string(args, "message", message); zcmd_write_src_nvlist(hdl, &zc, args); int err = zfs_ioctl(hdl, ZFS_IOC_LOG_HISTORY, &zc); nvlist_free(args); zcmd_free_nvlists(&zc); return (err); } /* * Perform ioctl to get some command history of a pool. * * 'buf' is the buffer to fill up to 'len' bytes. 'off' is the * logical offset of the history buffer to start reading from. * * Upon return, 'off' is the next logical offset to read from and * 'len' is the actual amount of bytes read into 'buf'. */ static int get_history(zpool_handle_t *zhp, char *buf, uint64_t *off, uint64_t *len) { zfs_cmd_t zc = {"\0"}; libzfs_handle_t *hdl = zhp->zpool_hdl; (void) strlcpy(zc.zc_name, zhp->zpool_name, sizeof (zc.zc_name)); zc.zc_history = (uint64_t)(uintptr_t)buf; zc.zc_history_len = *len; zc.zc_history_offset = *off; if (zfs_ioctl(hdl, ZFS_IOC_POOL_GET_HISTORY, &zc) != 0) { switch (errno) { case EPERM: return (zfs_error_fmt(hdl, EZFS_PERM, dgettext(TEXT_DOMAIN, "cannot show history for pool '%s'"), zhp->zpool_name)); case ENOENT: return (zfs_error_fmt(hdl, EZFS_NOHISTORY, dgettext(TEXT_DOMAIN, "cannot get history for pool " "'%s'"), zhp->zpool_name)); case ENOTSUP: return (zfs_error_fmt(hdl, EZFS_BADVERSION, dgettext(TEXT_DOMAIN, "cannot get history for pool " "'%s', pool must be upgraded"), zhp->zpool_name)); default: return (zpool_standard_error_fmt(hdl, errno, dgettext(TEXT_DOMAIN, "cannot get history for '%s'"), zhp->zpool_name)); } } *len = zc.zc_history_len; *off = zc.zc_history_offset; return (0); } /* * Retrieve the command history of a pool. */ int zpool_get_history(zpool_handle_t *zhp, nvlist_t **nvhisp, uint64_t *off, boolean_t *eof) { libzfs_handle_t *hdl = zhp->zpool_hdl; char *buf; int buflen = 128 * 1024; nvlist_t **records = NULL; uint_t numrecords = 0; int err = 0, i; uint64_t start = *off; buf = zfs_alloc(hdl, buflen); /* process about 1MiB a time */ while (*off - start < 1024 * 1024) { uint64_t bytes_read = buflen; uint64_t leftover; if ((err = get_history(zhp, buf, off, &bytes_read)) != 0) break; /* if nothing else was read in, we're at EOF, just return */ if (!bytes_read) { *eof = B_TRUE; break; } if ((err = zpool_history_unpack(buf, bytes_read, &leftover, &records, &numrecords)) != 0) { zpool_standard_error_fmt(hdl, err, dgettext(TEXT_DOMAIN, "cannot get history for '%s'"), zhp->zpool_name); break; } *off -= leftover; if (leftover == bytes_read) { /* * no progress made, because buffer is not big enough * to hold this record; resize and retry. */ buflen *= 2; free(buf); buf = zfs_alloc(hdl, buflen); } } free(buf); if (!err) { *nvhisp = fnvlist_alloc(); fnvlist_add_nvlist_array(*nvhisp, ZPOOL_HIST_RECORD, (const nvlist_t **)records, numrecords); } for (i = 0; i < numrecords; i++) nvlist_free(records[i]); free(records); return (err); } /* * Retrieve the next event given the passed 'zevent_fd' file descriptor. * If there is a new event available 'nvp' will contain a newly allocated * nvlist and 'dropped' will be set to the number of missed events since * the last call to this function. When 'nvp' is set to NULL it indicates * no new events are available. In either case the function returns 0 and * it is up to the caller to free 'nvp'. In the case of a fatal error the * function will return a non-zero value. When the function is called in * blocking mode (the default, unless the ZEVENT_NONBLOCK flag is passed), * it will not return until a new event is available. */ int zpool_events_next(libzfs_handle_t *hdl, nvlist_t **nvp, int *dropped, unsigned flags, int zevent_fd) { zfs_cmd_t zc = {"\0"}; int error = 0; *nvp = NULL; *dropped = 0; zc.zc_cleanup_fd = zevent_fd; if (flags & ZEVENT_NONBLOCK) zc.zc_guid = ZEVENT_NONBLOCK; zcmd_alloc_dst_nvlist(hdl, &zc, ZEVENT_SIZE); retry: if (zfs_ioctl(hdl, ZFS_IOC_EVENTS_NEXT, &zc) != 0) { switch (errno) { case ESHUTDOWN: error = zfs_error_fmt(hdl, EZFS_POOLUNAVAIL, dgettext(TEXT_DOMAIN, "zfs shutdown")); goto out; case ENOENT: /* Blocking error case should not occur */ if (!(flags & ZEVENT_NONBLOCK)) error = zpool_standard_error_fmt(hdl, errno, dgettext(TEXT_DOMAIN, "cannot get event")); goto out; case ENOMEM: zcmd_expand_dst_nvlist(hdl, &zc); goto retry; default: error = zpool_standard_error_fmt(hdl, errno, dgettext(TEXT_DOMAIN, "cannot get event")); goto out; } } error = zcmd_read_dst_nvlist(hdl, &zc, nvp); if (error != 0) goto out; *dropped = (int)zc.zc_cookie; out: zcmd_free_nvlists(&zc); return (error); } /* * Clear all events. */ int zpool_events_clear(libzfs_handle_t *hdl, int *count) { zfs_cmd_t zc = {"\0"}; if (zfs_ioctl(hdl, ZFS_IOC_EVENTS_CLEAR, &zc) != 0) return (zpool_standard_error(hdl, errno, dgettext(TEXT_DOMAIN, "cannot clear events"))); if (count != NULL) *count = (int)zc.zc_cookie; /* # of events cleared */ return (0); } /* * Seek to a specific EID, ZEVENT_SEEK_START, or ZEVENT_SEEK_END for * the passed zevent_fd file handle. On success zero is returned, * otherwise -1 is returned and hdl->libzfs_error is set to the errno. */ int zpool_events_seek(libzfs_handle_t *hdl, uint64_t eid, int zevent_fd) { zfs_cmd_t zc = {"\0"}; int error = 0; zc.zc_guid = eid; zc.zc_cleanup_fd = zevent_fd; if (zfs_ioctl(hdl, ZFS_IOC_EVENTS_SEEK, &zc) != 0) { switch (errno) { case ENOENT: error = zfs_error_fmt(hdl, EZFS_NOENT, dgettext(TEXT_DOMAIN, "cannot get event")); break; case ENOMEM: error = zfs_error_fmt(hdl, EZFS_NOMEM, dgettext(TEXT_DOMAIN, "cannot get event")); break; default: error = zpool_standard_error_fmt(hdl, errno, dgettext(TEXT_DOMAIN, "cannot get event")); break; } } return (error); } static void zpool_obj_to_path_impl(zpool_handle_t *zhp, uint64_t dsobj, uint64_t obj, char *pathname, size_t len, boolean_t always_unmounted) { zfs_cmd_t zc = {"\0"}; boolean_t mounted = B_FALSE; char *mntpnt = NULL; char dsname[ZFS_MAX_DATASET_NAME_LEN]; if (dsobj == 0) { /* special case for the MOS */ (void) snprintf(pathname, len, ":<0x%llx>", (longlong_t)obj); return; } /* get the dataset's name */ (void) strlcpy(zc.zc_name, zhp->zpool_name, sizeof (zc.zc_name)); zc.zc_obj = dsobj; if (zfs_ioctl(zhp->zpool_hdl, ZFS_IOC_DSOBJ_TO_DSNAME, &zc) != 0) { /* just write out a path of two object numbers */ (void) snprintf(pathname, len, "<0x%llx>:<0x%llx>", (longlong_t)dsobj, (longlong_t)obj); return; } (void) strlcpy(dsname, zc.zc_value, sizeof (dsname)); /* find out if the dataset is mounted */ mounted = !always_unmounted && is_mounted(zhp->zpool_hdl, dsname, &mntpnt); /* get the corrupted object's path */ (void) strlcpy(zc.zc_name, dsname, sizeof (zc.zc_name)); zc.zc_obj = obj; if (zfs_ioctl(zhp->zpool_hdl, ZFS_IOC_OBJ_TO_PATH, &zc) == 0) { if (mounted) { (void) snprintf(pathname, len, "%s%s", mntpnt, zc.zc_value); } else { (void) snprintf(pathname, len, "%s:%s", dsname, zc.zc_value); } } else { (void) snprintf(pathname, len, "%s:<0x%llx>", dsname, (longlong_t)obj); } free(mntpnt); } void zpool_obj_to_path(zpool_handle_t *zhp, uint64_t dsobj, uint64_t obj, char *pathname, size_t len) { zpool_obj_to_path_impl(zhp, dsobj, obj, pathname, len, B_FALSE); } void zpool_obj_to_path_ds(zpool_handle_t *zhp, uint64_t dsobj, uint64_t obj, char *pathname, size_t len) { zpool_obj_to_path_impl(zhp, dsobj, obj, pathname, len, B_TRUE); } /* * Wait while the specified activity is in progress in the pool. */ int zpool_wait(zpool_handle_t *zhp, zpool_wait_activity_t activity) { boolean_t missing; int error = zpool_wait_status(zhp, activity, &missing, NULL); if (missing) { (void) zpool_standard_error_fmt(zhp->zpool_hdl, ENOENT, dgettext(TEXT_DOMAIN, "error waiting in pool '%s'"), zhp->zpool_name); return (ENOENT); } else { return (error); } } /* * Wait for the given activity and return the status of the wait (whether or not * any waiting was done) in the 'waited' parameter. Non-existent pools are * reported via the 'missing' parameter, rather than by printing an error * message. This is convenient when this function is called in a loop over a * long period of time (as it is, for example, by zpool's wait cmd). In that * scenario, a pool being exported or destroyed should be considered a normal * event, so we don't want to print an error when we find that the pool doesn't * exist. */ int zpool_wait_status(zpool_handle_t *zhp, zpool_wait_activity_t activity, boolean_t *missing, boolean_t *waited) { int error = lzc_wait(zhp->zpool_name, activity, waited); *missing = (error == ENOENT); if (*missing) return (0); if (error != 0) { (void) zpool_standard_error_fmt(zhp->zpool_hdl, error, dgettext(TEXT_DOMAIN, "error waiting in pool '%s'"), zhp->zpool_name); } return (error); } int zpool_set_bootenv(zpool_handle_t *zhp, const nvlist_t *envmap) { int error = lzc_set_bootenv(zhp->zpool_name, envmap); if (error != 0) { (void) zpool_standard_error_fmt(zhp->zpool_hdl, error, dgettext(TEXT_DOMAIN, "error setting bootenv in pool '%s'"), zhp->zpool_name); } return (error); } int zpool_get_bootenv(zpool_handle_t *zhp, nvlist_t **nvlp) { nvlist_t *nvl; int error; nvl = NULL; error = lzc_get_bootenv(zhp->zpool_name, &nvl); if (error != 0) { (void) zpool_standard_error_fmt(zhp->zpool_hdl, error, dgettext(TEXT_DOMAIN, "error getting bootenv in pool '%s'"), zhp->zpool_name); } else { *nvlp = nvl; } return (error); } /* * Attempt to read and parse feature file(s) (from "compatibility" property). * Files contain zpool feature names, comma or whitespace-separated. * Comments (# character to next newline) are discarded. * * Arguments: * compatibility : string containing feature filenames * features : either NULL or pointer to array of boolean * report : either NULL or pointer to string buffer * rlen : length of "report" buffer * * compatibility is NULL (unset), "", "off", "legacy", or list of * comma-separated filenames. filenames should either be absolute, * or relative to: * 1) ZPOOL_SYSCONF_COMPAT_D (eg: /etc/zfs/compatibility.d) or * 2) ZPOOL_DATA_COMPAT_D (eg: /usr/share/zfs/compatibility.d). * (Unset), "" or "off" => enable all features * "legacy" => disable all features * * Any feature names read from files which match unames in spa_feature_table * will have the corresponding boolean set in the features array (if non-NULL). * If more than one feature set specified, only features present in *all* of * them will be set. * * "report" if not NULL will be populated with a suitable status message. * * Return values: * ZPOOL_COMPATIBILITY_OK : files read and parsed ok * ZPOOL_COMPATIBILITY_BADFILE : file too big or not a text file * ZPOOL_COMPATIBILITY_BADTOKEN : SYSCONF file contains invalid feature name * ZPOOL_COMPATIBILITY_WARNTOKEN : DATA file contains invalid feature name * ZPOOL_COMPATIBILITY_NOFILES : no feature files found */ zpool_compat_status_t zpool_load_compat(const char *compat, boolean_t *features, char *report, size_t rlen) { int sdirfd, ddirfd, featfd; struct stat fs; char *fc; char *ps, *ls, *ws; char *file, *line, *word; char l_compat[ZFS_MAXPROPLEN]; boolean_t ret_nofiles = B_TRUE; boolean_t ret_badfile = B_FALSE; boolean_t ret_badtoken = B_FALSE; boolean_t ret_warntoken = B_FALSE; /* special cases (unset), "" and "off" => enable all features */ if (compat == NULL || compat[0] == '\0' || strcmp(compat, ZPOOL_COMPAT_OFF) == 0) { if (features != NULL) for (uint_t i = 0; i < SPA_FEATURES; i++) features[i] = B_TRUE; if (report != NULL) strlcpy(report, gettext("all features enabled"), rlen); return (ZPOOL_COMPATIBILITY_OK); } /* Final special case "legacy" => disable all features */ if (strcmp(compat, ZPOOL_COMPAT_LEGACY) == 0) { if (features != NULL) for (uint_t i = 0; i < SPA_FEATURES; i++) features[i] = B_FALSE; if (report != NULL) strlcpy(report, gettext("all features disabled"), rlen); return (ZPOOL_COMPATIBILITY_OK); } /* * Start with all true; will be ANDed with results from each file */ if (features != NULL) for (uint_t i = 0; i < SPA_FEATURES; i++) features[i] = B_TRUE; char err_badfile[ZFS_MAXPROPLEN] = ""; char err_badtoken[ZFS_MAXPROPLEN] = ""; /* * We ignore errors from the directory open() * as they're only needed if the filename is relative * which will be checked during the openat(). */ /* O_PATH safer than O_RDONLY if system allows it */ #if defined(O_PATH) #define ZC_DIR_FLAGS (O_DIRECTORY | O_CLOEXEC | O_PATH) #else #define ZC_DIR_FLAGS (O_DIRECTORY | O_CLOEXEC | O_RDONLY) #endif sdirfd = open(ZPOOL_SYSCONF_COMPAT_D, ZC_DIR_FLAGS); ddirfd = open(ZPOOL_DATA_COMPAT_D, ZC_DIR_FLAGS); (void) strlcpy(l_compat, compat, ZFS_MAXPROPLEN); for (file = strtok_r(l_compat, ",", &ps); file != NULL; file = strtok_r(NULL, ",", &ps)) { boolean_t l_features[SPA_FEATURES]; enum { Z_SYSCONF, Z_DATA } source; /* try sysconfdir first, then datadir */ source = Z_SYSCONF; if ((featfd = openat(sdirfd, file, O_RDONLY | O_CLOEXEC)) < 0) { featfd = openat(ddirfd, file, O_RDONLY | O_CLOEXEC); source = Z_DATA; } /* File readable and correct size? */ if (featfd < 0 || fstat(featfd, &fs) < 0 || fs.st_size < 1 || fs.st_size > ZPOOL_COMPAT_MAXSIZE) { (void) close(featfd); strlcat(err_badfile, file, ZFS_MAXPROPLEN); strlcat(err_badfile, " ", ZFS_MAXPROPLEN); ret_badfile = B_TRUE; continue; } /* Prefault the file if system allows */ #if defined(MAP_POPULATE) #define ZC_MMAP_FLAGS (MAP_PRIVATE | MAP_POPULATE) #elif defined(MAP_PREFAULT_READ) #define ZC_MMAP_FLAGS (MAP_PRIVATE | MAP_PREFAULT_READ) #else #define ZC_MMAP_FLAGS (MAP_PRIVATE) #endif /* private mmap() so we can strtok safely */ fc = (char *)mmap(NULL, fs.st_size, PROT_READ | PROT_WRITE, ZC_MMAP_FLAGS, featfd, 0); (void) close(featfd); /* map ok, and last character == newline? */ if (fc == MAP_FAILED || fc[fs.st_size - 1] != '\n') { (void) munmap((void *) fc, fs.st_size); strlcat(err_badfile, file, ZFS_MAXPROPLEN); strlcat(err_badfile, " ", ZFS_MAXPROPLEN); ret_badfile = B_TRUE; continue; } ret_nofiles = B_FALSE; for (uint_t i = 0; i < SPA_FEATURES; i++) l_features[i] = B_FALSE; /* replace final newline with NULL to ensure string ends */ fc[fs.st_size - 1] = '\0'; for (line = strtok_r(fc, "\n", &ls); line != NULL; line = strtok_r(NULL, "\n", &ls)) { /* discard comments */ char *r = strchr(line, '#'); if (r != NULL) *r = '\0'; for (word = strtok_r(line, ", \t", &ws); word != NULL; word = strtok_r(NULL, ", \t", &ws)) { /* Find matching feature name */ uint_t f; for (f = 0; f < SPA_FEATURES; f++) { zfeature_info_t *fi = &spa_feature_table[f]; if (strcmp(word, fi->fi_uname) == 0) { l_features[f] = B_TRUE; break; } } if (f < SPA_FEATURES) continue; /* found an unrecognized word */ /* lightly sanitize it */ if (strlen(word) > 32) word[32] = '\0'; for (char *c = word; *c != '\0'; c++) if (!isprint(*c)) *c = '?'; strlcat(err_badtoken, word, ZFS_MAXPROPLEN); strlcat(err_badtoken, " ", ZFS_MAXPROPLEN); if (source == Z_SYSCONF) ret_badtoken = B_TRUE; else ret_warntoken = B_TRUE; } } (void) munmap((void *) fc, fs.st_size); if (features != NULL) for (uint_t i = 0; i < SPA_FEATURES; i++) features[i] &= l_features[i]; } (void) close(sdirfd); (void) close(ddirfd); /* Return the most serious error */ if (ret_badfile) { if (report != NULL) snprintf(report, rlen, gettext("could not read/" "parse feature file(s): %s"), err_badfile); return (ZPOOL_COMPATIBILITY_BADFILE); } if (ret_nofiles) { if (report != NULL) strlcpy(report, gettext("no valid compatibility files specified"), rlen); return (ZPOOL_COMPATIBILITY_NOFILES); } if (ret_badtoken) { if (report != NULL) snprintf(report, rlen, gettext("invalid feature " "name(s) in local compatibility files: %s"), err_badtoken); return (ZPOOL_COMPATIBILITY_BADTOKEN); } if (ret_warntoken) { if (report != NULL) snprintf(report, rlen, gettext("unrecognized feature " "name(s) in distribution compatibility files: %s"), err_badtoken); return (ZPOOL_COMPATIBILITY_WARNTOKEN); } if (report != NULL) strlcpy(report, gettext("compatibility set ok"), rlen); return (ZPOOL_COMPATIBILITY_OK); } static int zpool_vdev_guid(zpool_handle_t *zhp, const char *vdevname, uint64_t *vdev_guid) { nvlist_t *tgt; boolean_t avail_spare, l2cache; verify(zhp != NULL); if (zpool_get_state(zhp) == POOL_STATE_UNAVAIL) { char errbuf[ERRBUFLEN]; (void) snprintf(errbuf, sizeof (errbuf), dgettext(TEXT_DOMAIN, "pool is in an unavailable state")); return (zfs_error(zhp->zpool_hdl, EZFS_POOLUNAVAIL, errbuf)); } if ((tgt = zpool_find_vdev(zhp, vdevname, &avail_spare, &l2cache, NULL)) == NULL) { char errbuf[ERRBUFLEN]; (void) snprintf(errbuf, sizeof (errbuf), dgettext(TEXT_DOMAIN, "can not find %s in %s"), vdevname, zhp->zpool_name); return (zfs_error(zhp->zpool_hdl, EZFS_NODEVICE, errbuf)); } *vdev_guid = fnvlist_lookup_uint64(tgt, ZPOOL_CONFIG_GUID); return (0); } /* * Get a vdev property value for 'prop' and return the value in * a pre-allocated buffer. */ int zpool_get_vdev_prop_value(nvlist_t *nvprop, vdev_prop_t prop, char *prop_name, char *buf, size_t len, zprop_source_t *srctype, boolean_t literal) { nvlist_t *nv; const char *strval; uint64_t intval; zprop_source_t src = ZPROP_SRC_NONE; if (prop == VDEV_PROP_USERPROP) { /* user property, prop_name must contain the property name */ assert(prop_name != NULL); if (nvlist_lookup_nvlist(nvprop, prop_name, &nv) == 0) { src = fnvlist_lookup_uint64(nv, ZPROP_SOURCE); strval = fnvlist_lookup_string(nv, ZPROP_VALUE); } else { /* user prop not found */ return (-1); } (void) strlcpy(buf, strval, len); if (srctype) *srctype = src; return (0); } if (prop_name == NULL) prop_name = (char *)vdev_prop_to_name(prop); switch (vdev_prop_get_type(prop)) { case PROP_TYPE_STRING: if (nvlist_lookup_nvlist(nvprop, prop_name, &nv) == 0) { src = fnvlist_lookup_uint64(nv, ZPROP_SOURCE); strval = fnvlist_lookup_string(nv, ZPROP_VALUE); } else { src = ZPROP_SRC_DEFAULT; if ((strval = vdev_prop_default_string(prop)) == NULL) strval = "-"; } (void) strlcpy(buf, strval, len); break; case PROP_TYPE_NUMBER: if (nvlist_lookup_nvlist(nvprop, prop_name, &nv) == 0) { src = fnvlist_lookup_uint64(nv, ZPROP_SOURCE); intval = fnvlist_lookup_uint64(nv, ZPROP_VALUE); } else { src = ZPROP_SRC_DEFAULT; intval = vdev_prop_default_numeric(prop); } switch (prop) { case VDEV_PROP_ASIZE: case VDEV_PROP_PSIZE: case VDEV_PROP_SIZE: case VDEV_PROP_BOOTSIZE: case VDEV_PROP_ALLOCATED: case VDEV_PROP_FREE: case VDEV_PROP_READ_ERRORS: case VDEV_PROP_WRITE_ERRORS: case VDEV_PROP_CHECKSUM_ERRORS: case VDEV_PROP_INITIALIZE_ERRORS: case VDEV_PROP_OPS_NULL: case VDEV_PROP_OPS_READ: case VDEV_PROP_OPS_WRITE: case VDEV_PROP_OPS_FREE: case VDEV_PROP_OPS_CLAIM: case VDEV_PROP_OPS_TRIM: case VDEV_PROP_BYTES_NULL: case VDEV_PROP_BYTES_READ: case VDEV_PROP_BYTES_WRITE: case VDEV_PROP_BYTES_FREE: case VDEV_PROP_BYTES_CLAIM: case VDEV_PROP_BYTES_TRIM: if (literal) { (void) snprintf(buf, len, "%llu", (u_longlong_t)intval); } else { (void) zfs_nicenum(intval, buf, len); } break; case VDEV_PROP_EXPANDSZ: if (intval == 0) { (void) strlcpy(buf, "-", len); } else if (literal) { (void) snprintf(buf, len, "%llu", (u_longlong_t)intval); } else { (void) zfs_nicenum(intval, buf, len); } break; case VDEV_PROP_CAPACITY: if (literal) { (void) snprintf(buf, len, "%llu", (u_longlong_t)intval); } else { (void) snprintf(buf, len, "%llu%%", (u_longlong_t)intval); } break; case VDEV_PROP_FRAGMENTATION: if (intval == UINT64_MAX) { (void) strlcpy(buf, "-", len); } else { (void) snprintf(buf, len, "%llu%%", (u_longlong_t)intval); } break; case VDEV_PROP_STATE: if (literal) { (void) snprintf(buf, len, "%llu", (u_longlong_t)intval); } else { (void) strlcpy(buf, zpool_state_to_name(intval, VDEV_AUX_NONE), len); } break; default: (void) snprintf(buf, len, "%llu", (u_longlong_t)intval); } break; case PROP_TYPE_INDEX: if (nvlist_lookup_nvlist(nvprop, prop_name, &nv) == 0) { src = fnvlist_lookup_uint64(nv, ZPROP_SOURCE); intval = fnvlist_lookup_uint64(nv, ZPROP_VALUE); } else { src = ZPROP_SRC_DEFAULT; intval = vdev_prop_default_numeric(prop); } if (vdev_prop_index_to_string(prop, intval, (const char **)&strval) != 0) return (-1); (void) strlcpy(buf, strval, len); break; default: abort(); } if (srctype) *srctype = src; return (0); } /* * Get a vdev property value for 'prop_name' and return the value in * a pre-allocated buffer. */ int zpool_get_vdev_prop(zpool_handle_t *zhp, const char *vdevname, vdev_prop_t prop, char *prop_name, char *buf, size_t len, zprop_source_t *srctype, boolean_t literal) { nvlist_t *reqnvl, *reqprops; nvlist_t *retprops = NULL; uint64_t vdev_guid = 0; int ret; if ((ret = zpool_vdev_guid(zhp, vdevname, &vdev_guid)) != 0) return (ret); if (nvlist_alloc(&reqnvl, NV_UNIQUE_NAME, 0) != 0) return (no_memory(zhp->zpool_hdl)); if (nvlist_alloc(&reqprops, NV_UNIQUE_NAME, 0) != 0) return (no_memory(zhp->zpool_hdl)); fnvlist_add_uint64(reqnvl, ZPOOL_VDEV_PROPS_GET_VDEV, vdev_guid); if (prop != VDEV_PROP_USERPROP) { /* prop_name overrides prop value */ if (prop_name != NULL) prop = vdev_name_to_prop(prop_name); else prop_name = (char *)vdev_prop_to_name(prop); assert(prop < VDEV_NUM_PROPS); } assert(prop_name != NULL); if (nvlist_add_uint64(reqprops, prop_name, prop) != 0) { nvlist_free(reqnvl); nvlist_free(reqprops); return (no_memory(zhp->zpool_hdl)); } fnvlist_add_nvlist(reqnvl, ZPOOL_VDEV_PROPS_GET_PROPS, reqprops); ret = lzc_get_vdev_prop(zhp->zpool_name, reqnvl, &retprops); if (ret == 0) { ret = zpool_get_vdev_prop_value(retprops, prop, prop_name, buf, len, srctype, literal); } else { char errbuf[ERRBUFLEN]; (void) snprintf(errbuf, sizeof (errbuf), dgettext(TEXT_DOMAIN, "cannot get vdev property %s from" " %s in %s"), prop_name, vdevname, zhp->zpool_name); (void) zpool_standard_error(zhp->zpool_hdl, ret, errbuf); } nvlist_free(reqnvl); nvlist_free(reqprops); nvlist_free(retprops); return (ret); } /* * Get all vdev properties */ int zpool_get_all_vdev_props(zpool_handle_t *zhp, const char *vdevname, nvlist_t **outnvl) { nvlist_t *nvl = NULL; uint64_t vdev_guid = 0; int ret; if ((ret = zpool_vdev_guid(zhp, vdevname, &vdev_guid)) != 0) return (ret); if (nvlist_alloc(&nvl, NV_UNIQUE_NAME, 0) != 0) return (no_memory(zhp->zpool_hdl)); fnvlist_add_uint64(nvl, ZPOOL_VDEV_PROPS_GET_VDEV, vdev_guid); ret = lzc_get_vdev_prop(zhp->zpool_name, nvl, outnvl); nvlist_free(nvl); if (ret) { char errbuf[ERRBUFLEN]; (void) snprintf(errbuf, sizeof (errbuf), dgettext(TEXT_DOMAIN, "cannot get vdev properties for" " %s in %s"), vdevname, zhp->zpool_name); (void) zpool_standard_error(zhp->zpool_hdl, errno, errbuf); } return (ret); } /* * Set vdev property */ int zpool_set_vdev_prop(zpool_handle_t *zhp, const char *vdevname, const char *propname, const char *propval) { int ret; nvlist_t *nvl = NULL; nvlist_t *outnvl = NULL; nvlist_t *props; nvlist_t *realprops; prop_flags_t flags = { 0 }; uint64_t version; uint64_t vdev_guid; if ((ret = zpool_vdev_guid(zhp, vdevname, &vdev_guid)) != 0) return (ret); if (nvlist_alloc(&nvl, NV_UNIQUE_NAME, 0) != 0) return (no_memory(zhp->zpool_hdl)); if (nvlist_alloc(&props, NV_UNIQUE_NAME, 0) != 0) return (no_memory(zhp->zpool_hdl)); fnvlist_add_uint64(nvl, ZPOOL_VDEV_PROPS_SET_VDEV, vdev_guid); if (nvlist_add_string(props, propname, propval) != 0) { nvlist_free(props); return (no_memory(zhp->zpool_hdl)); } char errbuf[ERRBUFLEN]; (void) snprintf(errbuf, sizeof (errbuf), dgettext(TEXT_DOMAIN, "cannot set property %s for %s on %s"), propname, vdevname, zhp->zpool_name); flags.vdevprop = 1; version = zpool_get_prop_int(zhp, ZPOOL_PROP_VERSION, NULL); if ((realprops = zpool_valid_proplist(zhp->zpool_hdl, zhp->zpool_name, props, version, flags, errbuf)) == NULL) { nvlist_free(props); nvlist_free(nvl); return (-1); } nvlist_free(props); props = realprops; fnvlist_add_nvlist(nvl, ZPOOL_VDEV_PROPS_SET_PROPS, props); ret = lzc_set_vdev_prop(zhp->zpool_name, nvl, &outnvl); nvlist_free(props); nvlist_free(nvl); nvlist_free(outnvl); if (ret) (void) zpool_standard_error(zhp->zpool_hdl, errno, errbuf); return (ret); } diff --git a/lib/libzfs/libzfs_sendrecv.c b/lib/libzfs/libzfs_sendrecv.c index bf93ac9bac18..3e9f63777424 100644 --- a/lib/libzfs/libzfs_sendrecv.c +++ b/lib/libzfs/libzfs_sendrecv.c @@ -1,5471 +1,5471 @@ /* * CDDL HEADER START * * The contents of this file are subject to the terms of the * Common Development and Distribution License (the "License"). * You may not use this file except in compliance with the License. * * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE * or https://opensource.org/licenses/CDDL-1.0. * See the License for the specific language governing permissions * and limitations under the License. * * When distributing Covered Code, include this CDDL HEADER in each * file and include the License file at usr/src/OPENSOLARIS.LICENSE. * If applicable, add the following below this CDDL HEADER, with the * fields enclosed by brackets "[]" replaced with your own identifying * information: Portions Copyright [yyyy] [name of copyright owner] * * CDDL HEADER END */ /* * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. * Copyright (c) 2011, 2020 by Delphix. All rights reserved. * Copyright (c) 2012, Joyent, Inc. All rights reserved. * Copyright (c) 2012 Pawel Jakub Dawidek . * All rights reserved * Copyright (c) 2013 Steven Hartland. All rights reserved. * Copyright 2015, OmniTI Computer Consulting, Inc. All rights reserved. * Copyright 2016 Igor Kozhukhov * Copyright (c) 2018, loli10K . All rights reserved. * Copyright (c) 2019 Datto Inc. */ #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include "zfs_namecheck.h" #include "zfs_prop.h" #include "zfs_fletcher.h" #include "libzfs_impl.h" #include #include #include #include #include #include #include static int zfs_receive_impl(libzfs_handle_t *, const char *, const char *, recvflags_t *, int, const char *, nvlist_t *, avl_tree_t *, char **, const char *, nvlist_t *); static int guid_to_name_redact_snaps(libzfs_handle_t *hdl, const char *parent, uint64_t guid, boolean_t bookmark_ok, uint64_t *redact_snap_guids, uint64_t num_redact_snaps, char *name); static int guid_to_name(libzfs_handle_t *, const char *, uint64_t, boolean_t, char *); typedef struct progress_arg { zfs_handle_t *pa_zhp; int pa_fd; boolean_t pa_parsable; boolean_t pa_estimate; int pa_verbosity; } progress_arg_t; static int dump_record(dmu_replay_record_t *drr, void *payload, size_t payload_len, zio_cksum_t *zc, int outfd) { ASSERT3U(offsetof(dmu_replay_record_t, drr_u.drr_checksum.drr_checksum), ==, sizeof (dmu_replay_record_t) - sizeof (zio_cksum_t)); fletcher_4_incremental_native(drr, offsetof(dmu_replay_record_t, drr_u.drr_checksum.drr_checksum), zc); if (drr->drr_type != DRR_BEGIN) { ASSERT(ZIO_CHECKSUM_IS_ZERO(&drr->drr_u. drr_checksum.drr_checksum)); drr->drr_u.drr_checksum.drr_checksum = *zc; } fletcher_4_incremental_native(&drr->drr_u.drr_checksum.drr_checksum, sizeof (zio_cksum_t), zc); if (write(outfd, drr, sizeof (*drr)) == -1) return (errno); if (payload_len != 0) { fletcher_4_incremental_native(payload, payload_len, zc); if (write(outfd, payload, payload_len) == -1) return (errno); } return (0); } /* * Routines for dealing with the AVL tree of fs-nvlists */ typedef struct fsavl_node { avl_node_t fn_node; nvlist_t *fn_nvfs; char *fn_snapname; uint64_t fn_guid; } fsavl_node_t; static int fsavl_compare(const void *arg1, const void *arg2) { const fsavl_node_t *fn1 = (const fsavl_node_t *)arg1; const fsavl_node_t *fn2 = (const fsavl_node_t *)arg2; return (TREE_CMP(fn1->fn_guid, fn2->fn_guid)); } /* * Given the GUID of a snapshot, find its containing filesystem and * (optionally) name. */ static nvlist_t * fsavl_find(avl_tree_t *avl, uint64_t snapguid, char **snapname) { fsavl_node_t fn_find; fsavl_node_t *fn; fn_find.fn_guid = snapguid; fn = avl_find(avl, &fn_find, NULL); if (fn) { if (snapname) *snapname = fn->fn_snapname; return (fn->fn_nvfs); } return (NULL); } static void fsavl_destroy(avl_tree_t *avl) { fsavl_node_t *fn; void *cookie; if (avl == NULL) return; cookie = NULL; while ((fn = avl_destroy_nodes(avl, &cookie)) != NULL) free(fn); avl_destroy(avl); free(avl); } /* * Given an nvlist, produce an avl tree of snapshots, ordered by guid */ static avl_tree_t * fsavl_create(nvlist_t *fss) { avl_tree_t *fsavl; nvpair_t *fselem = NULL; if ((fsavl = malloc(sizeof (avl_tree_t))) == NULL) return (NULL); avl_create(fsavl, fsavl_compare, sizeof (fsavl_node_t), offsetof(fsavl_node_t, fn_node)); while ((fselem = nvlist_next_nvpair(fss, fselem)) != NULL) { nvlist_t *nvfs, *snaps; nvpair_t *snapelem = NULL; nvfs = fnvpair_value_nvlist(fselem); snaps = fnvlist_lookup_nvlist(nvfs, "snaps"); while ((snapelem = nvlist_next_nvpair(snaps, snapelem)) != NULL) { fsavl_node_t *fn; if ((fn = malloc(sizeof (fsavl_node_t))) == NULL) { fsavl_destroy(fsavl); return (NULL); } fn->fn_nvfs = nvfs; fn->fn_snapname = nvpair_name(snapelem); fn->fn_guid = fnvpair_value_uint64(snapelem); /* * Note: if there are multiple snaps with the * same GUID, we ignore all but one. */ avl_index_t where = 0; if (avl_find(fsavl, fn, &where) == NULL) avl_insert(fsavl, fn, where); else free(fn); } } return (fsavl); } /* * Routines for dealing with the giant nvlist of fs-nvlists, etc. */ typedef struct send_data { /* * assigned inside every recursive call, * restored from *_save on return: * * guid of fromsnap snapshot in parent dataset * txg of fromsnap snapshot in current dataset * txg of tosnap snapshot in current dataset */ uint64_t parent_fromsnap_guid; uint64_t fromsnap_txg; uint64_t tosnap_txg; /* the nvlists get accumulated during depth-first traversal */ nvlist_t *parent_snaps; nvlist_t *fss; nvlist_t *snapprops; nvlist_t *snapholds; /* user holds */ /* send-receive configuration, does not change during traversal */ const char *fsname; const char *fromsnap; const char *tosnap; boolean_t recursive; boolean_t raw; boolean_t doall; boolean_t replicate; boolean_t skipmissing; boolean_t verbose; boolean_t backup; boolean_t seenfrom; boolean_t seento; boolean_t holds; /* were holds requested with send -h */ boolean_t props; /* * The header nvlist is of the following format: * { * "tosnap" -> string * "fromsnap" -> string (if incremental) * "fss" -> { * id -> { * * "name" -> string (full name; for debugging) * "parentfromsnap" -> number (guid of fromsnap in parent) * * "props" -> { name -> value (only if set here) } * "snaps" -> { name (lastname) -> number (guid) } * "snapprops" -> { name (lastname) -> { name -> value } } * "snapholds" -> { name (lastname) -> { holdname -> crtime } } * * "origin" -> number (guid) (if clone) * "is_encroot" -> boolean * "sent" -> boolean (not on-disk) * } * } * } * */ } send_data_t; static void send_iterate_prop(zfs_handle_t *zhp, boolean_t received_only, nvlist_t *nv); /* * Collect guid, valid props, optionally holds, etc. of a snapshot. * This interface is intended for use as a zfs_iter_snapshots_sorted visitor. */ static int send_iterate_snap(zfs_handle_t *zhp, void *arg) { send_data_t *sd = arg; uint64_t guid = zhp->zfs_dmustats.dds_guid; uint64_t txg = zhp->zfs_dmustats.dds_creation_txg; boolean_t isfromsnap, istosnap, istosnapwithnofrom; char *snapname; const char *from = sd->fromsnap; const char *to = sd->tosnap; snapname = strrchr(zhp->zfs_name, '@'); assert(snapname != NULL); ++snapname; isfromsnap = (from != NULL && strcmp(from, snapname) == 0); istosnap = (to != NULL && strcmp(to, snapname) == 0); istosnapwithnofrom = (istosnap && from == NULL); if (sd->tosnap_txg != 0 && txg > sd->tosnap_txg) { if (sd->verbose) { (void) fprintf(stderr, dgettext(TEXT_DOMAIN, "skipping snapshot %s because it was created " "after the destination snapshot (%s)\n"), zhp->zfs_name, to); } zfs_close(zhp); return (0); } fnvlist_add_uint64(sd->parent_snaps, snapname, guid); /* * NB: if there is no fromsnap here (it's a newly created fs in * an incremental replication), we will substitute the tosnap. */ if (isfromsnap || (sd->parent_fromsnap_guid == 0 && istosnap)) sd->parent_fromsnap_guid = guid; if (!sd->recursive) { /* * To allow a doall stream to work properly * with a NULL fromsnap */ if (sd->doall && from == NULL && !sd->seenfrom) sd->seenfrom = B_TRUE; if (!sd->seenfrom && isfromsnap) { sd->seenfrom = B_TRUE; zfs_close(zhp); return (0); } if ((sd->seento || !sd->seenfrom) && !istosnapwithnofrom) { zfs_close(zhp); return (0); } if (istosnap) sd->seento = B_TRUE; } nvlist_t *nv = fnvlist_alloc(); send_iterate_prop(zhp, sd->backup, nv); fnvlist_add_nvlist(sd->snapprops, snapname, nv); fnvlist_free(nv); if (sd->holds) { nvlist_t *holds; if (lzc_get_holds(zhp->zfs_name, &holds) == 0) { fnvlist_add_nvlist(sd->snapholds, snapname, holds); fnvlist_free(holds); } } zfs_close(zhp); return (0); } /* * Collect all valid props from the handle snap into an nvlist. */ static void send_iterate_prop(zfs_handle_t *zhp, boolean_t received_only, nvlist_t *nv) { nvlist_t *props; if (received_only) props = zfs_get_recvd_props(zhp); else props = zhp->zfs_props; nvpair_t *elem = NULL; while ((elem = nvlist_next_nvpair(props, elem)) != NULL) { char *propname = nvpair_name(elem); zfs_prop_t prop = zfs_name_to_prop(propname); if (!zfs_prop_user(propname)) { /* * Realistically, this should never happen. However, * we want the ability to add DSL properties without * needing to make incompatible version changes. We * need to ignore unknown properties to allow older * software to still send datasets containing these * properties, with the unknown properties elided. */ if (prop == ZPROP_INVAL) continue; if (zfs_prop_readonly(prop)) continue; } nvlist_t *propnv = fnvpair_value_nvlist(elem); boolean_t isspacelimit = (prop == ZFS_PROP_QUOTA || prop == ZFS_PROP_RESERVATION || prop == ZFS_PROP_REFQUOTA || prop == ZFS_PROP_REFRESERVATION); if (isspacelimit && zhp->zfs_type == ZFS_TYPE_SNAPSHOT) continue; char *source; if (nvlist_lookup_string(propnv, ZPROP_SOURCE, &source) == 0) { if (strcmp(source, zhp->zfs_name) != 0 && strcmp(source, ZPROP_SOURCE_VAL_RECVD) != 0) continue; } else { /* * May have no source before SPA_VERSION_RECVD_PROPS, * but is still modifiable. */ if (!isspacelimit) continue; } if (zfs_prop_user(propname) || zfs_prop_get_type(prop) == PROP_TYPE_STRING) { char *value; value = fnvlist_lookup_string(propnv, ZPROP_VALUE); fnvlist_add_string(nv, propname, value); } else { uint64_t value; value = fnvlist_lookup_uint64(propnv, ZPROP_VALUE); fnvlist_add_uint64(nv, propname, value); } } } /* * returns snapshot guid * and returns 0 if the snapshot does not exist */ static uint64_t get_snap_guid(libzfs_handle_t *hdl, const char *fs, const char *snap) { char name[MAXPATHLEN + 1]; uint64_t guid = 0; if (fs == NULL || fs[0] == '\0' || snap == NULL || snap[0] == '\0') return (guid); (void) snprintf(name, sizeof (name), "%s@%s", fs, snap); zfs_handle_t *zhp = zfs_open(hdl, name, ZFS_TYPE_SNAPSHOT); if (zhp != NULL) { guid = zfs_prop_get_int(zhp, ZFS_PROP_GUID); zfs_close(zhp); } return (guid); } /* * returns snapshot creation txg * and returns 0 if the snapshot does not exist */ static uint64_t get_snap_txg(libzfs_handle_t *hdl, const char *fs, const char *snap) { char name[ZFS_MAX_DATASET_NAME_LEN]; uint64_t txg = 0; if (fs == NULL || fs[0] == '\0' || snap == NULL || snap[0] == '\0') return (txg); (void) snprintf(name, sizeof (name), "%s@%s", fs, snap); if (zfs_dataset_exists(hdl, name, ZFS_TYPE_SNAPSHOT)) { zfs_handle_t *zhp = zfs_open(hdl, name, ZFS_TYPE_SNAPSHOT); if (zhp != NULL) { txg = zfs_prop_get_int(zhp, ZFS_PROP_CREATETXG); zfs_close(zhp); } } return (txg); } /* * Recursively generate nvlists describing datasets. See comment * for the data structure send_data_t above for description of contents * of the nvlist. */ static int send_iterate_fs(zfs_handle_t *zhp, void *arg) { send_data_t *sd = arg; nvlist_t *nvfs = NULL, *nv = NULL; int rv = 0; uint64_t min_txg = 0, max_txg = 0; uint64_t txg = zhp->zfs_dmustats.dds_creation_txg; uint64_t guid = zhp->zfs_dmustats.dds_guid; uint64_t fromsnap_txg, tosnap_txg; char guidstring[64]; /* These fields are restored on return from a recursive call. */ uint64_t parent_fromsnap_guid_save = sd->parent_fromsnap_guid; uint64_t fromsnap_txg_save = sd->fromsnap_txg; uint64_t tosnap_txg_save = sd->tosnap_txg; fromsnap_txg = get_snap_txg(zhp->zfs_hdl, zhp->zfs_name, sd->fromsnap); if (fromsnap_txg != 0) sd->fromsnap_txg = fromsnap_txg; tosnap_txg = get_snap_txg(zhp->zfs_hdl, zhp->zfs_name, sd->tosnap); if (tosnap_txg != 0) sd->tosnap_txg = tosnap_txg; /* * On the send side, if the current dataset does not have tosnap, * perform two additional checks: * * - Skip sending the current dataset if it was created later than * the parent tosnap. * - Return error if the current dataset was created earlier than * the parent tosnap, unless --skip-missing specified. Then * just print a warning. */ if (sd->tosnap != NULL && tosnap_txg == 0) { if (sd->tosnap_txg != 0 && txg > sd->tosnap_txg) { if (sd->verbose) { (void) fprintf(stderr, dgettext(TEXT_DOMAIN, "skipping dataset %s: snapshot %s does " "not exist\n"), zhp->zfs_name, sd->tosnap); } } else if (sd->skipmissing) { (void) fprintf(stderr, dgettext(TEXT_DOMAIN, "WARNING: skipping dataset %s and its children:" " snapshot %s does not exist\n"), zhp->zfs_name, sd->tosnap); } else { (void) fprintf(stderr, dgettext(TEXT_DOMAIN, "cannot send %s@%s%s: snapshot %s@%s does not " "exist\n"), sd->fsname, sd->tosnap, sd->recursive ? dgettext(TEXT_DOMAIN, " recursively") : "", zhp->zfs_name, sd->tosnap); rv = EZFS_NOENT; } goto out; } nvfs = fnvlist_alloc(); fnvlist_add_string(nvfs, "name", zhp->zfs_name); fnvlist_add_uint64(nvfs, "parentfromsnap", sd->parent_fromsnap_guid); if (zhp->zfs_dmustats.dds_origin[0] != '\0') { zfs_handle_t *origin = zfs_open(zhp->zfs_hdl, zhp->zfs_dmustats.dds_origin, ZFS_TYPE_SNAPSHOT); if (origin == NULL) { rv = -1; goto out; } fnvlist_add_uint64(nvfs, "origin", origin->zfs_dmustats.dds_guid); zfs_close(origin); } /* Iterate over props. */ if (sd->props || sd->backup || sd->recursive) { nv = fnvlist_alloc(); send_iterate_prop(zhp, sd->backup, nv); fnvlist_add_nvlist(nvfs, "props", nv); } if (zfs_prop_get_int(zhp, ZFS_PROP_ENCRYPTION) != ZIO_CRYPT_OFF) { boolean_t encroot; /* Determine if this dataset is an encryption root. */ if (zfs_crypto_get_encryption_root(zhp, &encroot, NULL) != 0) { rv = -1; goto out; } if (encroot) fnvlist_add_boolean(nvfs, "is_encroot"); /* * Encrypted datasets can only be sent with properties if * the raw flag is specified because the receive side doesn't * currently have a mechanism for recursively asking the user * for new encryption parameters. */ if (!sd->raw) { (void) fprintf(stderr, dgettext(TEXT_DOMAIN, "cannot send %s@%s: encrypted dataset %s may not " "be sent with properties without the raw flag\n"), sd->fsname, sd->tosnap, zhp->zfs_name); rv = -1; goto out; } } /* * Iterate over snaps, and set sd->parent_fromsnap_guid. * * If this is a "doall" send, a replicate send or we're just trying * to gather a list of previous snapshots, iterate through all the * snaps in the txg range. Otherwise just look at the one we're * interested in. */ sd->parent_fromsnap_guid = 0; sd->parent_snaps = fnvlist_alloc(); sd->snapprops = fnvlist_alloc(); if (sd->holds) sd->snapholds = fnvlist_alloc(); if (sd->doall || sd->replicate || sd->tosnap == NULL) { if (!sd->replicate && fromsnap_txg != 0) min_txg = fromsnap_txg; if (!sd->replicate && tosnap_txg != 0) max_txg = tosnap_txg; (void) zfs_iter_snapshots_sorted(zhp, send_iterate_snap, sd, min_txg, max_txg); } else { char snapname[MAXPATHLEN]; zfs_handle_t *snap; (void) snprintf(snapname, sizeof (snapname), "%s@%s", zhp->zfs_name, sd->tosnap); if (sd->fromsnap != NULL) sd->seenfrom = B_TRUE; snap = zfs_open(zhp->zfs_hdl, snapname, ZFS_TYPE_SNAPSHOT); if (snap != NULL) (void) send_iterate_snap(snap, sd); } fnvlist_add_nvlist(nvfs, "snaps", sd->parent_snaps); fnvlist_free(sd->parent_snaps); fnvlist_add_nvlist(nvfs, "snapprops", sd->snapprops); fnvlist_free(sd->snapprops); if (sd->holds) { fnvlist_add_nvlist(nvfs, "snapholds", sd->snapholds); fnvlist_free(sd->snapholds); } /* Do not allow the size of the properties list to exceed the limit */ if ((fnvlist_size(nvfs) + fnvlist_size(sd->fss)) > zhp->zfs_hdl->libzfs_max_nvlist) { (void) fprintf(stderr, dgettext(TEXT_DOMAIN, "warning: cannot send %s@%s: the size of the list of " "snapshots and properties is too large to be received " "successfully.\n" "Select a smaller number of snapshots to send.\n"), zhp->zfs_name, sd->tosnap); rv = EZFS_NOSPC; goto out; } /* Add this fs to nvlist. */ (void) snprintf(guidstring, sizeof (guidstring), "0x%llx", (longlong_t)guid); fnvlist_add_nvlist(sd->fss, guidstring, nvfs); /* Iterate over children. */ if (sd->recursive) rv = zfs_iter_filesystems(zhp, send_iterate_fs, sd); out: /* Restore saved fields. */ sd->parent_fromsnap_guid = parent_fromsnap_guid_save; sd->fromsnap_txg = fromsnap_txg_save; sd->tosnap_txg = tosnap_txg_save; fnvlist_free(nv); fnvlist_free(nvfs); zfs_close(zhp); return (rv); } static int gather_nvlist(libzfs_handle_t *hdl, const char *fsname, const char *fromsnap, const char *tosnap, boolean_t recursive, boolean_t raw, boolean_t doall, boolean_t replicate, boolean_t skipmissing, boolean_t verbose, boolean_t backup, boolean_t holds, boolean_t props, nvlist_t **nvlp, avl_tree_t **avlp) { zfs_handle_t *zhp; send_data_t sd = { 0 }; int error; zhp = zfs_open(hdl, fsname, ZFS_TYPE_FILESYSTEM | ZFS_TYPE_VOLUME); if (zhp == NULL) return (EZFS_BADTYPE); sd.fss = fnvlist_alloc(); sd.fsname = fsname; sd.fromsnap = fromsnap; sd.tosnap = tosnap; sd.recursive = recursive; sd.raw = raw; sd.doall = doall; sd.replicate = replicate; sd.skipmissing = skipmissing; sd.verbose = verbose; sd.backup = backup; sd.holds = holds; sd.props = props; if ((error = send_iterate_fs(zhp, &sd)) != 0) { fnvlist_free(sd.fss); if (avlp != NULL) *avlp = NULL; *nvlp = NULL; return (error); } if (avlp != NULL && (*avlp = fsavl_create(sd.fss)) == NULL) { fnvlist_free(sd.fss); *nvlp = NULL; return (EZFS_NOMEM); } *nvlp = sd.fss; return (0); } /* * Routines specific to "zfs send" */ typedef struct send_dump_data { /* these are all just the short snapname (the part after the @) */ const char *fromsnap; const char *tosnap; char prevsnap[ZFS_MAX_DATASET_NAME_LEN]; uint64_t prevsnap_obj; boolean_t seenfrom, seento, replicate, doall, fromorigin; boolean_t dryrun, parsable, progress, embed_data, std_out; boolean_t large_block, compress, raw, holds; int outfd; boolean_t err; nvlist_t *fss; nvlist_t *snapholds; avl_tree_t *fsavl; snapfilter_cb_t *filter_cb; void *filter_cb_arg; nvlist_t *debugnv; char holdtag[ZFS_MAX_DATASET_NAME_LEN]; int cleanup_fd; int verbosity; uint64_t size; } send_dump_data_t; static int zfs_send_space(zfs_handle_t *zhp, const char *snapname, const char *from, enum lzc_send_flags flags, uint64_t *spacep) { assert(snapname != NULL); int error = lzc_send_space(snapname, from, flags, spacep); if (error == 0) return (0); char errbuf[ERRBUFLEN]; (void) snprintf(errbuf, sizeof (errbuf), dgettext(TEXT_DOMAIN, "warning: cannot estimate space for '%s'"), snapname); libzfs_handle_t *hdl = zhp->zfs_hdl; switch (error) { case EXDEV: zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "not an earlier snapshot from the same fs")); return (zfs_error(hdl, EZFS_CROSSTARGET, errbuf)); case ENOENT: if (zfs_dataset_exists(hdl, snapname, ZFS_TYPE_SNAPSHOT)) { zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "incremental source (%s) does not exist"), snapname); } return (zfs_error(hdl, EZFS_NOENT, errbuf)); case EDQUOT: case EFBIG: case EIO: case ENOLINK: case ENOSPC: case ENOSTR: case ENXIO: case EPIPE: case ERANGE: case EFAULT: case EROFS: case EINVAL: zfs_error_aux(hdl, "%s", strerror(error)); return (zfs_error(hdl, EZFS_BADBACKUP, errbuf)); default: return (zfs_standard_error(hdl, error, errbuf)); } } /* * Dumps a backup of the given snapshot (incremental from fromsnap if it's not * NULL) to the file descriptor specified by outfd. */ static int dump_ioctl(zfs_handle_t *zhp, const char *fromsnap, uint64_t fromsnap_obj, boolean_t fromorigin, int outfd, enum lzc_send_flags flags, nvlist_t *debugnv) { zfs_cmd_t zc = {"\0"}; libzfs_handle_t *hdl = zhp->zfs_hdl; nvlist_t *thisdbg; assert(zhp->zfs_type == ZFS_TYPE_SNAPSHOT); assert(fromsnap_obj == 0 || !fromorigin); (void) strlcpy(zc.zc_name, zhp->zfs_name, sizeof (zc.zc_name)); zc.zc_cookie = outfd; zc.zc_obj = fromorigin; zc.zc_sendobj = zfs_prop_get_int(zhp, ZFS_PROP_OBJSETID); zc.zc_fromobj = fromsnap_obj; zc.zc_flags = flags; if (debugnv != NULL) { thisdbg = fnvlist_alloc(); if (fromsnap != NULL && fromsnap[0] != '\0') fnvlist_add_string(thisdbg, "fromsnap", fromsnap); } if (zfs_ioctl(zhp->zfs_hdl, ZFS_IOC_SEND, &zc) != 0) { char errbuf[ERRBUFLEN]; int error = errno; (void) snprintf(errbuf, sizeof (errbuf), "%s '%s'", dgettext(TEXT_DOMAIN, "warning: cannot send"), zhp->zfs_name); if (debugnv != NULL) { fnvlist_add_uint64(thisdbg, "error", error); fnvlist_add_nvlist(debugnv, zhp->zfs_name, thisdbg); fnvlist_free(thisdbg); } switch (error) { case EXDEV: zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "not an earlier snapshot from the same fs")); return (zfs_error(hdl, EZFS_CROSSTARGET, errbuf)); case EACCES: zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "source key must be loaded")); return (zfs_error(hdl, EZFS_CRYPTOFAILED, errbuf)); case ENOENT: if (zfs_dataset_exists(hdl, zc.zc_name, ZFS_TYPE_SNAPSHOT)) { zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "incremental source (@%s) does not exist"), zc.zc_value); } return (zfs_error(hdl, EZFS_NOENT, errbuf)); case EDQUOT: case EFBIG: case EIO: case ENOLINK: case ENOSPC: case ENOSTR: case ENXIO: case EPIPE: case ERANGE: case EFAULT: case EROFS: case EINVAL: zfs_error_aux(hdl, "%s", strerror(errno)); return (zfs_error(hdl, EZFS_BADBACKUP, errbuf)); default: return (zfs_standard_error(hdl, errno, errbuf)); } } if (debugnv != NULL) { fnvlist_add_nvlist(debugnv, zhp->zfs_name, thisdbg); fnvlist_free(thisdbg); } return (0); } static void gather_holds(zfs_handle_t *zhp, send_dump_data_t *sdd) { assert(zhp->zfs_type == ZFS_TYPE_SNAPSHOT); /* * zfs_send() only sets snapholds for sends that need them, * e.g. replication and doall. */ if (sdd->snapholds == NULL) return; fnvlist_add_string(sdd->snapholds, zhp->zfs_name, sdd->holdtag); } int zfs_send_progress(zfs_handle_t *zhp, int fd, uint64_t *bytes_written, uint64_t *blocks_visited) { zfs_cmd_t zc = {"\0"}; if (bytes_written != NULL) *bytes_written = 0; if (blocks_visited != NULL) *blocks_visited = 0; (void) strlcpy(zc.zc_name, zhp->zfs_name, sizeof (zc.zc_name)); zc.zc_cookie = fd; if (zfs_ioctl(zhp->zfs_hdl, ZFS_IOC_SEND_PROGRESS, &zc) != 0) return (errno); if (bytes_written != NULL) *bytes_written = zc.zc_cookie; if (blocks_visited != NULL) *blocks_visited = zc.zc_objset_type; return (0); } static void * send_progress_thread(void *arg) { progress_arg_t *pa = arg; zfs_handle_t *zhp = pa->pa_zhp; uint64_t bytes; uint64_t blocks; char buf[16]; time_t t; struct tm tm; int err; if (!pa->pa_parsable) { (void) fprintf(stderr, "TIME %s %sSNAPSHOT %s\n", pa->pa_estimate ? "BYTES" : " SENT", pa->pa_verbosity >= 2 ? " BLOCKS " : "", zhp->zfs_name); } /* * Print the progress from ZFS_IOC_SEND_PROGRESS every second. */ for (;;) { (void) sleep(1); if ((err = zfs_send_progress(zhp, pa->pa_fd, &bytes, &blocks)) != 0) { if (err == EINTR || err == ENOENT) return ((void *)0); return ((void *)(uintptr_t)err); } (void) time(&t); localtime_r(&t, &tm); if (pa->pa_verbosity >= 2 && pa->pa_parsable) { (void) fprintf(stderr, "%02d:%02d:%02d\t%llu\t%llu\t%s\n", tm.tm_hour, tm.tm_min, tm.tm_sec, (u_longlong_t)bytes, (u_longlong_t)blocks, zhp->zfs_name); } else if (pa->pa_verbosity >= 2) { zfs_nicenum(bytes, buf, sizeof (buf)); (void) fprintf(stderr, "%02d:%02d:%02d %5s %8llu %s\n", tm.tm_hour, tm.tm_min, tm.tm_sec, buf, (u_longlong_t)blocks, zhp->zfs_name); } else if (pa->pa_parsable) { (void) fprintf(stderr, "%02d:%02d:%02d\t%llu\t%s\n", tm.tm_hour, tm.tm_min, tm.tm_sec, (u_longlong_t)bytes, zhp->zfs_name); } else { zfs_nicebytes(bytes, buf, sizeof (buf)); (void) fprintf(stderr, "%02d:%02d:%02d %5s %s\n", tm.tm_hour, tm.tm_min, tm.tm_sec, buf, zhp->zfs_name); } } } static boolean_t send_progress_thread_exit(libzfs_handle_t *hdl, pthread_t ptid) { void *status = NULL; (void) pthread_cancel(ptid); (void) pthread_join(ptid, &status); int error = (int)(uintptr_t)status; if (error != 0 && status != PTHREAD_CANCELED) return (zfs_standard_error(hdl, error, dgettext(TEXT_DOMAIN, "progress thread exited nonzero"))); else return (B_FALSE); } static void send_print_verbose(FILE *fout, const char *tosnap, const char *fromsnap, uint64_t size, boolean_t parsable) { if (parsable) { if (fromsnap != NULL) { (void) fprintf(fout, dgettext(TEXT_DOMAIN, "incremental\t%s\t%s"), fromsnap, tosnap); } else { (void) fprintf(fout, dgettext(TEXT_DOMAIN, "full\t%s"), tosnap); } (void) fprintf(fout, "\t%llu", (longlong_t)size); } else { if (fromsnap != NULL) { if (strchr(fromsnap, '@') == NULL && strchr(fromsnap, '#') == NULL) { (void) fprintf(fout, dgettext(TEXT_DOMAIN, "send from @%s to %s"), fromsnap, tosnap); } else { (void) fprintf(fout, dgettext(TEXT_DOMAIN, "send from %s to %s"), fromsnap, tosnap); } } else { (void) fprintf(fout, dgettext(TEXT_DOMAIN, "full send of %s"), tosnap); } if (size != 0) { char buf[16]; zfs_nicebytes(size, buf, sizeof (buf)); (void) fprintf(fout, dgettext(TEXT_DOMAIN, " estimated size is %s"), buf); } } (void) fprintf(fout, "\n"); } /* * Send a single filesystem snapshot, updating the send dump data. * This interface is intended for use as a zfs_iter_snapshots_sorted visitor. */ static int dump_snapshot(zfs_handle_t *zhp, void *arg) { send_dump_data_t *sdd = arg; progress_arg_t pa = { 0 }; pthread_t tid; char *thissnap; enum lzc_send_flags flags = 0; int err; boolean_t isfromsnap, istosnap, fromorigin; boolean_t exclude = B_FALSE; FILE *fout = sdd->std_out ? stdout : stderr; err = 0; thissnap = strchr(zhp->zfs_name, '@') + 1; isfromsnap = (sdd->fromsnap != NULL && strcmp(sdd->fromsnap, thissnap) == 0); if (!sdd->seenfrom && isfromsnap) { gather_holds(zhp, sdd); sdd->seenfrom = B_TRUE; (void) strlcpy(sdd->prevsnap, thissnap, sizeof (sdd->prevsnap)); sdd->prevsnap_obj = zfs_prop_get_int(zhp, ZFS_PROP_OBJSETID); zfs_close(zhp); return (0); } if (sdd->seento || !sdd->seenfrom) { zfs_close(zhp); return (0); } istosnap = (strcmp(sdd->tosnap, thissnap) == 0); if (istosnap) sdd->seento = B_TRUE; if (sdd->large_block) flags |= LZC_SEND_FLAG_LARGE_BLOCK; if (sdd->embed_data) flags |= LZC_SEND_FLAG_EMBED_DATA; if (sdd->compress) flags |= LZC_SEND_FLAG_COMPRESS; if (sdd->raw) flags |= LZC_SEND_FLAG_RAW; if (!sdd->doall && !isfromsnap && !istosnap) { if (sdd->replicate) { char *snapname; nvlist_t *snapprops; /* * Filter out all intermediate snapshots except origin * snapshots needed to replicate clones. */ nvlist_t *nvfs = fsavl_find(sdd->fsavl, zhp->zfs_dmustats.dds_guid, &snapname); if (nvfs != NULL) { snapprops = fnvlist_lookup_nvlist(nvfs, "snapprops"); snapprops = fnvlist_lookup_nvlist(snapprops, thissnap); exclude = !nvlist_exists(snapprops, "is_clone_origin"); } } else { exclude = B_TRUE; } } /* * If a filter function exists, call it to determine whether * this snapshot will be sent. */ if (exclude || (sdd->filter_cb != NULL && sdd->filter_cb(zhp, sdd->filter_cb_arg) == B_FALSE)) { /* * This snapshot is filtered out. Don't send it, and don't * set prevsnap_obj, so it will be as if this snapshot didn't * exist, and the next accepted snapshot will be sent as * an incremental from the last accepted one, or as the * first (and full) snapshot in the case of a replication, * non-incremental send. */ zfs_close(zhp); return (0); } gather_holds(zhp, sdd); fromorigin = sdd->prevsnap[0] == '\0' && (sdd->fromorigin || sdd->replicate); if (sdd->verbosity != 0) { uint64_t size = 0; char fromds[ZFS_MAX_DATASET_NAME_LEN]; if (sdd->prevsnap[0] != '\0') { (void) strlcpy(fromds, zhp->zfs_name, sizeof (fromds)); *(strchr(fromds, '@') + 1) = '\0'; (void) strlcat(fromds, sdd->prevsnap, sizeof (fromds)); } if (zfs_send_space(zhp, zhp->zfs_name, sdd->prevsnap[0] ? fromds : NULL, flags, &size) == 0) { send_print_verbose(fout, zhp->zfs_name, sdd->prevsnap[0] ? sdd->prevsnap : NULL, size, sdd->parsable); sdd->size += size; } } if (!sdd->dryrun) { /* * If progress reporting is requested, spawn a new thread to * poll ZFS_IOC_SEND_PROGRESS at a regular interval. */ if (sdd->progress) { pa.pa_zhp = zhp; pa.pa_fd = sdd->outfd; pa.pa_parsable = sdd->parsable; pa.pa_estimate = B_FALSE; pa.pa_verbosity = sdd->verbosity; if ((err = pthread_create(&tid, NULL, send_progress_thread, &pa)) != 0) { zfs_close(zhp); return (err); } } err = dump_ioctl(zhp, sdd->prevsnap, sdd->prevsnap_obj, fromorigin, sdd->outfd, flags, sdd->debugnv); if (sdd->progress && send_progress_thread_exit(zhp->zfs_hdl, tid)) return (-1); } (void) strlcpy(sdd->prevsnap, thissnap, sizeof (sdd->prevsnap)); sdd->prevsnap_obj = zfs_prop_get_int(zhp, ZFS_PROP_OBJSETID); zfs_close(zhp); return (err); } /* * Send all snapshots for a filesystem, updating the send dump data. */ static int dump_filesystem(zfs_handle_t *zhp, send_dump_data_t *sdd) { int rv = 0; boolean_t missingfrom = B_FALSE; zfs_cmd_t zc = {"\0"}; uint64_t min_txg = 0, max_txg = 0; /* * Make sure the tosnap exists. */ (void) snprintf(zc.zc_name, sizeof (zc.zc_name), "%s@%s", zhp->zfs_name, sdd->tosnap); if (zfs_ioctl(zhp->zfs_hdl, ZFS_IOC_OBJSET_STATS, &zc) != 0) { (void) fprintf(stderr, dgettext(TEXT_DOMAIN, "WARNING: could not send %s@%s: does not exist\n"), zhp->zfs_name, sdd->tosnap); sdd->err = B_TRUE; return (0); } /* * If this fs does not have fromsnap, and we're doing * recursive, we need to send a full stream from the * beginning (or an incremental from the origin if this * is a clone). If we're doing non-recursive, then let * them get the error. */ if (sdd->replicate && sdd->fromsnap) { /* * Make sure the fromsnap exists. */ (void) snprintf(zc.zc_name, sizeof (zc.zc_name), "%s@%s", zhp->zfs_name, sdd->fromsnap); if (zfs_ioctl(zhp->zfs_hdl, ZFS_IOC_OBJSET_STATS, &zc) != 0) missingfrom = B_TRUE; } sdd->seenfrom = sdd->seento = B_FALSE; sdd->prevsnap[0] = '\0'; sdd->prevsnap_obj = 0; if (sdd->fromsnap == NULL || missingfrom) sdd->seenfrom = B_TRUE; /* * Iterate through all snapshots and process the ones we will be * sending. If we only have a "from" and "to" snapshot to deal * with, we can avoid iterating through all the other snapshots. */ if (sdd->doall || sdd->replicate || sdd->tosnap == NULL) { if (!sdd->replicate) { if (sdd->fromsnap != NULL) { min_txg = get_snap_txg(zhp->zfs_hdl, zhp->zfs_name, sdd->fromsnap); } if (sdd->tosnap != NULL) { max_txg = get_snap_txg(zhp->zfs_hdl, zhp->zfs_name, sdd->tosnap); } } rv = zfs_iter_snapshots_sorted(zhp, dump_snapshot, sdd, min_txg, max_txg); } else { char snapname[MAXPATHLEN] = { 0 }; zfs_handle_t *snap; /* Dump fromsnap. */ if (!sdd->seenfrom) { (void) snprintf(snapname, sizeof (snapname), "%s@%s", zhp->zfs_name, sdd->fromsnap); snap = zfs_open(zhp->zfs_hdl, snapname, ZFS_TYPE_SNAPSHOT); if (snap != NULL) rv = dump_snapshot(snap, sdd); else rv = -1; } /* Dump tosnap. */ if (rv == 0) { (void) snprintf(snapname, sizeof (snapname), "%s@%s", zhp->zfs_name, sdd->tosnap); snap = zfs_open(zhp->zfs_hdl, snapname, ZFS_TYPE_SNAPSHOT); if (snap != NULL) rv = dump_snapshot(snap, sdd); else rv = -1; } } if (!sdd->seenfrom) { (void) fprintf(stderr, dgettext(TEXT_DOMAIN, "WARNING: could not send %s@%s:\n" "incremental source (%s@%s) does not exist\n"), zhp->zfs_name, sdd->tosnap, zhp->zfs_name, sdd->fromsnap); sdd->err = B_TRUE; } else if (!sdd->seento) { if (sdd->fromsnap) { (void) fprintf(stderr, dgettext(TEXT_DOMAIN, "WARNING: could not send %s@%s:\n" "incremental source (%s@%s) " "is not earlier than it\n"), zhp->zfs_name, sdd->tosnap, zhp->zfs_name, sdd->fromsnap); } else { (void) fprintf(stderr, dgettext(TEXT_DOMAIN, "WARNING: " "could not send %s@%s: does not exist\n"), zhp->zfs_name, sdd->tosnap); } sdd->err = B_TRUE; } return (rv); } /* * Send all snapshots for all filesystems in sdd. */ static int dump_filesystems(zfs_handle_t *rzhp, send_dump_data_t *sdd) { nvpair_t *fspair; boolean_t needagain, progress; if (!sdd->replicate) return (dump_filesystem(rzhp, sdd)); /* Mark the clone origin snapshots. */ for (fspair = nvlist_next_nvpair(sdd->fss, NULL); fspair; fspair = nvlist_next_nvpair(sdd->fss, fspair)) { nvlist_t *nvfs; uint64_t origin_guid = 0; nvfs = fnvpair_value_nvlist(fspair); (void) nvlist_lookup_uint64(nvfs, "origin", &origin_guid); if (origin_guid != 0) { char *snapname; nvlist_t *origin_nv = fsavl_find(sdd->fsavl, origin_guid, &snapname); if (origin_nv != NULL) { nvlist_t *snapprops; snapprops = fnvlist_lookup_nvlist(origin_nv, "snapprops"); snapprops = fnvlist_lookup_nvlist(snapprops, snapname); fnvlist_add_boolean(snapprops, "is_clone_origin"); } } } again: needagain = progress = B_FALSE; for (fspair = nvlist_next_nvpair(sdd->fss, NULL); fspair; fspair = nvlist_next_nvpair(sdd->fss, fspair)) { nvlist_t *fslist, *parent_nv; char *fsname; zfs_handle_t *zhp; int err; uint64_t origin_guid = 0; uint64_t parent_guid = 0; fslist = fnvpair_value_nvlist(fspair); if (nvlist_lookup_boolean(fslist, "sent") == 0) continue; fsname = fnvlist_lookup_string(fslist, "name"); (void) nvlist_lookup_uint64(fslist, "origin", &origin_guid); (void) nvlist_lookup_uint64(fslist, "parentfromsnap", &parent_guid); if (parent_guid != 0) { parent_nv = fsavl_find(sdd->fsavl, parent_guid, NULL); if (!nvlist_exists(parent_nv, "sent")) { /* Parent has not been sent; skip this one. */ needagain = B_TRUE; continue; } } if (origin_guid != 0) { nvlist_t *origin_nv = fsavl_find(sdd->fsavl, origin_guid, NULL); if (origin_nv != NULL && !nvlist_exists(origin_nv, "sent")) { /* * Origin has not been sent yet; * skip this clone. */ needagain = B_TRUE; continue; } } zhp = zfs_open(rzhp->zfs_hdl, fsname, ZFS_TYPE_DATASET); if (zhp == NULL) return (-1); err = dump_filesystem(zhp, sdd); fnvlist_add_boolean(fslist, "sent"); progress = B_TRUE; zfs_close(zhp); if (err) return (err); } if (needagain) { assert(progress); goto again; } /* Clean out the sent flags in case we reuse this fss. */ for (fspair = nvlist_next_nvpair(sdd->fss, NULL); fspair; fspair = nvlist_next_nvpair(sdd->fss, fspair)) { nvlist_t *fslist; fslist = fnvpair_value_nvlist(fspair); (void) nvlist_remove_all(fslist, "sent"); } return (0); } nvlist_t * zfs_send_resume_token_to_nvlist(libzfs_handle_t *hdl, const char *token) { unsigned int version; int nread, i; unsigned long long checksum, packed_len; /* * Decode token header, which is: * -- * Note that the only supported token version is 1. */ nread = sscanf(token, "%u-%llx-%llx-", &version, &checksum, &packed_len); if (nread != 3) { zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "resume token is corrupt (invalid format)")); return (NULL); } if (version != ZFS_SEND_RESUME_TOKEN_VERSION) { zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "resume token is corrupt (invalid version %u)"), version); return (NULL); } /* Convert hexadecimal representation to binary. */ token = strrchr(token, '-') + 1; int len = strlen(token) / 2; unsigned char *compressed = zfs_alloc(hdl, len); for (i = 0; i < len; i++) { nread = sscanf(token + i * 2, "%2hhx", compressed + i); if (nread != 1) { free(compressed); zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "resume token is corrupt " "(payload is not hex-encoded)")); return (NULL); } } /* Verify checksum. */ zio_cksum_t cksum; fletcher_4_native_varsize(compressed, len, &cksum); if (cksum.zc_word[0] != checksum) { free(compressed); zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "resume token is corrupt (incorrect checksum)")); return (NULL); } /* Uncompress. */ void *packed = zfs_alloc(hdl, packed_len); uLongf packed_len_long = packed_len; if (uncompress(packed, &packed_len_long, compressed, len) != Z_OK || packed_len_long != packed_len) { free(packed); free(compressed); zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "resume token is corrupt (decompression failed)")); return (NULL); } /* Unpack nvlist. */ nvlist_t *nv; int error = nvlist_unpack(packed, packed_len, &nv, KM_SLEEP); free(packed); free(compressed); if (error != 0) { zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "resume token is corrupt (nvlist_unpack failed)")); return (NULL); } return (nv); } static enum lzc_send_flags lzc_flags_from_sendflags(const sendflags_t *flags) { enum lzc_send_flags lzc_flags = 0; if (flags->largeblock) lzc_flags |= LZC_SEND_FLAG_LARGE_BLOCK; if (flags->embed_data) lzc_flags |= LZC_SEND_FLAG_EMBED_DATA; if (flags->compress) lzc_flags |= LZC_SEND_FLAG_COMPRESS; if (flags->raw) lzc_flags |= LZC_SEND_FLAG_RAW; if (flags->saved) lzc_flags |= LZC_SEND_FLAG_SAVED; return (lzc_flags); } static int estimate_size(zfs_handle_t *zhp, const char *from, int fd, sendflags_t *flags, uint64_t resumeobj, uint64_t resumeoff, uint64_t bytes, const char *redactbook, char *errbuf) { uint64_t size; FILE *fout = flags->dryrun ? stdout : stderr; progress_arg_t pa = { 0 }; int err = 0; pthread_t ptid; if (flags->progress) { pa.pa_zhp = zhp; pa.pa_fd = fd; pa.pa_parsable = flags->parsable; pa.pa_estimate = B_TRUE; pa.pa_verbosity = flags->verbosity; err = pthread_create(&ptid, NULL, send_progress_thread, &pa); if (err != 0) { zfs_error_aux(zhp->zfs_hdl, "%s", strerror(errno)); return (zfs_error(zhp->zfs_hdl, EZFS_THREADCREATEFAILED, errbuf)); } } err = lzc_send_space_resume_redacted(zhp->zfs_name, from, lzc_flags_from_sendflags(flags), resumeobj, resumeoff, bytes, redactbook, fd, &size); if (flags->progress && send_progress_thread_exit(zhp->zfs_hdl, ptid)) return (-1); if (err != 0) { zfs_error_aux(zhp->zfs_hdl, "%s", strerror(err)); return (zfs_error(zhp->zfs_hdl, EZFS_BADBACKUP, errbuf)); } send_print_verbose(fout, zhp->zfs_name, from, size, flags->parsable); if (flags->parsable) { (void) fprintf(fout, "size\t%llu\n", (longlong_t)size); } else { char buf[16]; zfs_nicenum(size, buf, sizeof (buf)); (void) fprintf(fout, dgettext(TEXT_DOMAIN, "total estimated size is %s\n"), buf); } return (0); } static boolean_t redact_snaps_contains(const uint64_t *snaps, uint64_t num_snaps, uint64_t guid) { for (int i = 0; i < num_snaps; i++) { if (snaps[i] == guid) return (B_TRUE); } return (B_FALSE); } static boolean_t redact_snaps_equal(const uint64_t *snaps1, uint64_t num_snaps1, const uint64_t *snaps2, uint64_t num_snaps2) { if (num_snaps1 != num_snaps2) return (B_FALSE); for (int i = 0; i < num_snaps1; i++) { if (!redact_snaps_contains(snaps2, num_snaps2, snaps1[i])) return (B_FALSE); } return (B_TRUE); } static int get_bookmarks(const char *path, nvlist_t **bmarksp) { nvlist_t *props = fnvlist_alloc(); int error; fnvlist_add_boolean(props, "redact_complete"); fnvlist_add_boolean(props, zfs_prop_to_name(ZFS_PROP_REDACT_SNAPS)); error = lzc_get_bookmarks(path, props, bmarksp); fnvlist_free(props); return (error); } static nvpair_t * find_redact_pair(nvlist_t *bmarks, const uint64_t *redact_snap_guids, int num_redact_snaps) { nvpair_t *pair; for (pair = nvlist_next_nvpair(bmarks, NULL); pair; pair = nvlist_next_nvpair(bmarks, pair)) { nvlist_t *bmark = fnvpair_value_nvlist(pair); nvlist_t *vallist = fnvlist_lookup_nvlist(bmark, zfs_prop_to_name(ZFS_PROP_REDACT_SNAPS)); uint_t len = 0; uint64_t *bmarksnaps = fnvlist_lookup_uint64_array(vallist, ZPROP_VALUE, &len); if (redact_snaps_equal(redact_snap_guids, num_redact_snaps, bmarksnaps, len)) { break; } } return (pair); } static boolean_t get_redact_complete(nvpair_t *pair) { nvlist_t *bmark = fnvpair_value_nvlist(pair); nvlist_t *vallist = fnvlist_lookup_nvlist(bmark, "redact_complete"); boolean_t complete = fnvlist_lookup_boolean_value(vallist, ZPROP_VALUE); return (complete); } /* * Check that the list of redaction snapshots in the bookmark matches the send * we're resuming, and return whether or not it's complete. * * Note that the caller needs to free the contents of *bookname with free() if * this function returns successfully. */ static int find_redact_book(libzfs_handle_t *hdl, const char *path, const uint64_t *redact_snap_guids, int num_redact_snaps, char **bookname) { char errbuf[ERRBUFLEN]; nvlist_t *bmarks; (void) snprintf(errbuf, sizeof (errbuf), dgettext(TEXT_DOMAIN, "cannot resume send")); int error = get_bookmarks(path, &bmarks); if (error != 0) { if (error == ESRCH) { zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "nonexistent redaction bookmark provided")); } else if (error == ENOENT) { zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "dataset to be sent no longer exists")); } else { zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "unknown error: %s"), strerror(error)); } return (zfs_error(hdl, EZFS_BADPROP, errbuf)); } nvpair_t *pair = find_redact_pair(bmarks, redact_snap_guids, num_redact_snaps); if (pair == NULL) { fnvlist_free(bmarks); zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "no appropriate redaction bookmark exists")); return (zfs_error(hdl, EZFS_BADPROP, errbuf)); } boolean_t complete = get_redact_complete(pair); if (!complete) { fnvlist_free(bmarks); zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "incomplete redaction bookmark provided")); return (zfs_error(hdl, EZFS_BADPROP, errbuf)); } *bookname = strndup(nvpair_name(pair), ZFS_MAX_DATASET_NAME_LEN); ASSERT3P(*bookname, !=, NULL); fnvlist_free(bmarks); return (0); } static enum lzc_send_flags lzc_flags_from_resume_nvl(nvlist_t *resume_nvl) { enum lzc_send_flags lzc_flags = 0; if (nvlist_exists(resume_nvl, "largeblockok")) lzc_flags |= LZC_SEND_FLAG_LARGE_BLOCK; if (nvlist_exists(resume_nvl, "embedok")) lzc_flags |= LZC_SEND_FLAG_EMBED_DATA; if (nvlist_exists(resume_nvl, "compressok")) lzc_flags |= LZC_SEND_FLAG_COMPRESS; if (nvlist_exists(resume_nvl, "rawok")) lzc_flags |= LZC_SEND_FLAG_RAW; if (nvlist_exists(resume_nvl, "savedok")) lzc_flags |= LZC_SEND_FLAG_SAVED; return (lzc_flags); } static int zfs_send_resume_impl_cb_impl(libzfs_handle_t *hdl, sendflags_t *flags, int outfd, nvlist_t *resume_nvl) { char errbuf[ERRBUFLEN]; char *toname; char *fromname = NULL; uint64_t resumeobj, resumeoff, toguid, fromguid, bytes; zfs_handle_t *zhp; int error = 0; char name[ZFS_MAX_DATASET_NAME_LEN]; FILE *fout = (flags->verbosity > 0 && flags->dryrun) ? stdout : stderr; uint64_t *redact_snap_guids = NULL; int num_redact_snaps = 0; char *redact_book = NULL; (void) snprintf(errbuf, sizeof (errbuf), dgettext(TEXT_DOMAIN, "cannot resume send")); if (flags->verbosity != 0) { (void) fprintf(fout, dgettext(TEXT_DOMAIN, "resume token contents:\n")); nvlist_print(fout, resume_nvl); } if (nvlist_lookup_string(resume_nvl, "toname", &toname) != 0 || nvlist_lookup_uint64(resume_nvl, "object", &resumeobj) != 0 || nvlist_lookup_uint64(resume_nvl, "offset", &resumeoff) != 0 || nvlist_lookup_uint64(resume_nvl, "bytes", &bytes) != 0 || nvlist_lookup_uint64(resume_nvl, "toguid", &toguid) != 0) { zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "resume token is corrupt")); return (zfs_error(hdl, EZFS_FAULT, errbuf)); } fromguid = 0; (void) nvlist_lookup_uint64(resume_nvl, "fromguid", &fromguid); if (flags->saved) { (void) strlcpy(name, toname, sizeof (name)); } else { error = guid_to_name(hdl, toname, toguid, B_FALSE, name); if (error != 0) { if (zfs_dataset_exists(hdl, toname, ZFS_TYPE_DATASET)) { zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "'%s' is no longer the same snapshot " "used in the initial send"), toname); } else { zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "'%s' used in the initial send no " "longer exists"), toname); } return (zfs_error(hdl, EZFS_BADPATH, errbuf)); } } zhp = zfs_open(hdl, name, ZFS_TYPE_DATASET); if (zhp == NULL) { zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "unable to access '%s'"), name); return (zfs_error(hdl, EZFS_BADPATH, errbuf)); } if (nvlist_lookup_uint64_array(resume_nvl, "book_redact_snaps", &redact_snap_guids, (uint_t *)&num_redact_snaps) != 0) { num_redact_snaps = -1; } if (fromguid != 0) { if (guid_to_name_redact_snaps(hdl, toname, fromguid, B_TRUE, redact_snap_guids, num_redact_snaps, name) != 0) { zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "incremental source %#llx no longer exists"), (longlong_t)fromguid); return (zfs_error(hdl, EZFS_BADPATH, errbuf)); } fromname = name; } redact_snap_guids = NULL; if (nvlist_lookup_uint64_array(resume_nvl, zfs_prop_to_name(ZFS_PROP_REDACT_SNAPS), &redact_snap_guids, (uint_t *)&num_redact_snaps) == 0) { char path[ZFS_MAX_DATASET_NAME_LEN]; (void) strlcpy(path, toname, sizeof (path)); char *at = strchr(path, '@'); ASSERT3P(at, !=, NULL); *at = '\0'; if ((error = find_redact_book(hdl, path, redact_snap_guids, num_redact_snaps, &redact_book)) != 0) { return (error); } } enum lzc_send_flags lzc_flags = lzc_flags_from_sendflags(flags) | lzc_flags_from_resume_nvl(resume_nvl); if (flags->verbosity != 0) { /* * Some of these may have come from the resume token, set them * here for size estimate purposes. */ sendflags_t tmpflags = *flags; if (lzc_flags & LZC_SEND_FLAG_LARGE_BLOCK) tmpflags.largeblock = B_TRUE; if (lzc_flags & LZC_SEND_FLAG_COMPRESS) tmpflags.compress = B_TRUE; if (lzc_flags & LZC_SEND_FLAG_EMBED_DATA) tmpflags.embed_data = B_TRUE; if (lzc_flags & LZC_SEND_FLAG_RAW) tmpflags.raw = B_TRUE; if (lzc_flags & LZC_SEND_FLAG_SAVED) tmpflags.saved = B_TRUE; error = estimate_size(zhp, fromname, outfd, &tmpflags, resumeobj, resumeoff, bytes, redact_book, errbuf); } if (!flags->dryrun) { progress_arg_t pa = { 0 }; pthread_t tid; /* * If progress reporting is requested, spawn a new thread to * poll ZFS_IOC_SEND_PROGRESS at a regular interval. */ if (flags->progress) { pa.pa_zhp = zhp; pa.pa_fd = outfd; pa.pa_parsable = flags->parsable; pa.pa_estimate = B_FALSE; pa.pa_verbosity = flags->verbosity; error = pthread_create(&tid, NULL, send_progress_thread, &pa); if (error != 0) { if (redact_book != NULL) free(redact_book); zfs_close(zhp); return (error); } } error = lzc_send_resume_redacted(zhp->zfs_name, fromname, outfd, lzc_flags, resumeobj, resumeoff, redact_book); if (redact_book != NULL) free(redact_book); if (flags->progress && send_progress_thread_exit(hdl, tid)) return (-1); char errbuf[ERRBUFLEN]; (void) snprintf(errbuf, sizeof (errbuf), dgettext(TEXT_DOMAIN, "warning: cannot send '%s'"), zhp->zfs_name); zfs_close(zhp); switch (error) { case 0: return (0); case EACCES: zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "source key must be loaded")); return (zfs_error(hdl, EZFS_CRYPTOFAILED, errbuf)); case ESRCH: if (lzc_exists(zhp->zfs_name)) { zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "incremental source could not be found")); } return (zfs_error(hdl, EZFS_NOENT, errbuf)); case EXDEV: case ENOENT: case EDQUOT: case EFBIG: case EIO: case ENOLINK: case ENOSPC: case ENOSTR: case ENXIO: case EPIPE: case ERANGE: case EFAULT: case EROFS: zfs_error_aux(hdl, "%s", strerror(errno)); return (zfs_error(hdl, EZFS_BADBACKUP, errbuf)); default: return (zfs_standard_error(hdl, errno, errbuf)); } } else { if (redact_book != NULL) free(redact_book); } zfs_close(zhp); return (error); } struct zfs_send_resume_impl { libzfs_handle_t *hdl; sendflags_t *flags; nvlist_t *resume_nvl; }; static int zfs_send_resume_impl_cb(int outfd, void *arg) { struct zfs_send_resume_impl *zsri = arg; return (zfs_send_resume_impl_cb_impl(zsri->hdl, zsri->flags, outfd, zsri->resume_nvl)); } static int zfs_send_resume_impl(libzfs_handle_t *hdl, sendflags_t *flags, int outfd, nvlist_t *resume_nvl) { struct zfs_send_resume_impl zsri = { .hdl = hdl, .flags = flags, .resume_nvl = resume_nvl, }; return (lzc_send_wrapper(zfs_send_resume_impl_cb, outfd, &zsri)); } int zfs_send_resume(libzfs_handle_t *hdl, sendflags_t *flags, int outfd, const char *resume_token) { int ret; char errbuf[ERRBUFLEN]; nvlist_t *resume_nvl; (void) snprintf(errbuf, sizeof (errbuf), dgettext(TEXT_DOMAIN, "cannot resume send")); resume_nvl = zfs_send_resume_token_to_nvlist(hdl, resume_token); if (resume_nvl == NULL) { /* * zfs_error_aux has already been set by * zfs_send_resume_token_to_nvlist() */ return (zfs_error(hdl, EZFS_FAULT, errbuf)); } ret = zfs_send_resume_impl(hdl, flags, outfd, resume_nvl); fnvlist_free(resume_nvl); return (ret); } int zfs_send_saved(zfs_handle_t *zhp, sendflags_t *flags, int outfd, const char *resume_token) { int ret; libzfs_handle_t *hdl = zhp->zfs_hdl; nvlist_t *saved_nvl = NULL, *resume_nvl = NULL; uint64_t saved_guid = 0, resume_guid = 0; uint64_t obj = 0, off = 0, bytes = 0; char token_buf[ZFS_MAXPROPLEN]; char errbuf[ERRBUFLEN]; (void) snprintf(errbuf, sizeof (errbuf), dgettext(TEXT_DOMAIN, "saved send failed")); ret = zfs_prop_get(zhp, ZFS_PROP_RECEIVE_RESUME_TOKEN, token_buf, sizeof (token_buf), NULL, NULL, 0, B_TRUE); if (ret != 0) goto out; saved_nvl = zfs_send_resume_token_to_nvlist(hdl, token_buf); if (saved_nvl == NULL) { /* * zfs_error_aux has already been set by * zfs_send_resume_token_to_nvlist() */ ret = zfs_error(hdl, EZFS_FAULT, errbuf); goto out; } /* * If a resume token is provided we use the object and offset * from that instead of the default, which starts from the * beginning. */ if (resume_token != NULL) { resume_nvl = zfs_send_resume_token_to_nvlist(hdl, resume_token); if (resume_nvl == NULL) { ret = zfs_error(hdl, EZFS_FAULT, errbuf); goto out; } if (nvlist_lookup_uint64(resume_nvl, "object", &obj) != 0 || nvlist_lookup_uint64(resume_nvl, "offset", &off) != 0 || nvlist_lookup_uint64(resume_nvl, "bytes", &bytes) != 0 || nvlist_lookup_uint64(resume_nvl, "toguid", &resume_guid) != 0) { zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "provided resume token is corrupt")); ret = zfs_error(hdl, EZFS_FAULT, errbuf); goto out; } if (nvlist_lookup_uint64(saved_nvl, "toguid", &saved_guid)) { zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "dataset's resume token is corrupt")); ret = zfs_error(hdl, EZFS_FAULT, errbuf); goto out; } if (resume_guid != saved_guid) { zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "provided resume token does not match dataset")); ret = zfs_error(hdl, EZFS_BADBACKUP, errbuf); goto out; } } (void) nvlist_remove_all(saved_nvl, "object"); fnvlist_add_uint64(saved_nvl, "object", obj); (void) nvlist_remove_all(saved_nvl, "offset"); fnvlist_add_uint64(saved_nvl, "offset", off); (void) nvlist_remove_all(saved_nvl, "bytes"); fnvlist_add_uint64(saved_nvl, "bytes", bytes); (void) nvlist_remove_all(saved_nvl, "toname"); fnvlist_add_string(saved_nvl, "toname", zhp->zfs_name); ret = zfs_send_resume_impl(hdl, flags, outfd, saved_nvl); out: fnvlist_free(saved_nvl); fnvlist_free(resume_nvl); return (ret); } /* * This function informs the target system that the recursive send is complete. * The record is also expected in the case of a send -p. */ static int send_conclusion_record(int fd, zio_cksum_t *zc) { dmu_replay_record_t drr = { 0 }; drr.drr_type = DRR_END; if (zc != NULL) drr.drr_u.drr_end.drr_checksum = *zc; if (write(fd, &drr, sizeof (drr)) == -1) { return (errno); } return (0); } /* * This function is responsible for sending the records that contain the * necessary information for the target system's libzfs to be able to set the * properties of the filesystem being received, or to be able to prepare for * a recursive receive. * * The "zhp" argument is the handle of the snapshot we are sending * (the "tosnap"). The "from" argument is the short snapshot name (the part * after the @) of the incremental source. */ static int send_prelim_records(zfs_handle_t *zhp, const char *from, int fd, boolean_t gather_props, boolean_t recursive, boolean_t verbose, boolean_t dryrun, boolean_t raw, boolean_t replicate, boolean_t skipmissing, boolean_t backup, boolean_t holds, boolean_t props, boolean_t doall, nvlist_t **fssp, avl_tree_t **fsavlp) { int err = 0; char *packbuf = NULL; size_t buflen = 0; zio_cksum_t zc = { {0} }; int featureflags = 0; /* name of filesystem/volume that contains snapshot we are sending */ char tofs[ZFS_MAX_DATASET_NAME_LEN]; /* short name of snap we are sending */ const char *tosnap = ""; char errbuf[ERRBUFLEN]; (void) snprintf(errbuf, sizeof (errbuf), dgettext(TEXT_DOMAIN, "warning: cannot send '%s'"), zhp->zfs_name); if (zhp->zfs_type == ZFS_TYPE_FILESYSTEM && zfs_prop_get_int(zhp, ZFS_PROP_VERSION) >= ZPL_VERSION_SA) { featureflags |= DMU_BACKUP_FEATURE_SA_SPILL; } if (holds) featureflags |= DMU_BACKUP_FEATURE_HOLDS; (void) strlcpy(tofs, zhp->zfs_name, ZFS_MAX_DATASET_NAME_LEN); char *at = strchr(tofs, '@'); if (at != NULL) { *at = '\0'; tosnap = at + 1; } if (gather_props) { nvlist_t *hdrnv = fnvlist_alloc(); nvlist_t *fss = NULL; if (from != NULL) fnvlist_add_string(hdrnv, "fromsnap", from); fnvlist_add_string(hdrnv, "tosnap", tosnap); if (!recursive) fnvlist_add_boolean(hdrnv, "not_recursive"); if (raw) { fnvlist_add_boolean(hdrnv, "raw"); } - if ((err = gather_nvlist(zhp->zfs_hdl, tofs, + if (gather_nvlist(zhp->zfs_hdl, tofs, from, tosnap, recursive, raw, doall, replicate, skipmissing, - verbose, backup, holds, props, &fss, fsavlp)) != 0) { + verbose, backup, holds, props, &fss, fsavlp) != 0) { return (zfs_error(zhp->zfs_hdl, EZFS_BADBACKUP, errbuf)); } /* * Do not allow the size of the properties list to exceed * the limit */ if ((fnvlist_size(fss) + fnvlist_size(hdrnv)) > zhp->zfs_hdl->libzfs_max_nvlist) { (void) snprintf(errbuf, sizeof (errbuf), dgettext(TEXT_DOMAIN, "warning: cannot send '%s': " "the size of the list of snapshots and properties " "is too large to be received successfully.\n" "Select a smaller number of snapshots to send.\n"), zhp->zfs_name); return (zfs_error(zhp->zfs_hdl, EZFS_NOSPC, errbuf)); } fnvlist_add_nvlist(hdrnv, "fss", fss); VERIFY0(nvlist_pack(hdrnv, &packbuf, &buflen, NV_ENCODE_XDR, 0)); if (fssp != NULL) { *fssp = fss; } else { fnvlist_free(fss); } fnvlist_free(hdrnv); } if (!dryrun) { dmu_replay_record_t drr = { 0 }; /* write first begin record */ drr.drr_type = DRR_BEGIN; drr.drr_u.drr_begin.drr_magic = DMU_BACKUP_MAGIC; DMU_SET_STREAM_HDRTYPE(drr.drr_u.drr_begin. drr_versioninfo, DMU_COMPOUNDSTREAM); DMU_SET_FEATUREFLAGS(drr.drr_u.drr_begin. drr_versioninfo, featureflags); if (snprintf(drr.drr_u.drr_begin.drr_toname, sizeof (drr.drr_u.drr_begin.drr_toname), "%s@%s", tofs, tosnap) >= sizeof (drr.drr_u.drr_begin.drr_toname)) { return (zfs_error(zhp->zfs_hdl, EZFS_BADBACKUP, errbuf)); } drr.drr_payloadlen = buflen; err = dump_record(&drr, packbuf, buflen, &zc, fd); free(packbuf); if (err != 0) { zfs_error_aux(zhp->zfs_hdl, "%s", strerror(err)); return (zfs_error(zhp->zfs_hdl, EZFS_BADBACKUP, errbuf)); } err = send_conclusion_record(fd, &zc); if (err != 0) { zfs_error_aux(zhp->zfs_hdl, "%s", strerror(err)); return (zfs_error(zhp->zfs_hdl, EZFS_BADBACKUP, errbuf)); } } return (0); } /* * Generate a send stream. The "zhp" argument is the filesystem/volume * that contains the snapshot to send. The "fromsnap" argument is the * short name (the part after the '@') of the snapshot that is the * incremental source to send from (if non-NULL). The "tosnap" argument * is the short name of the snapshot to send. * * The content of the send stream is the snapshot identified by * 'tosnap'. Incremental streams are requested in two ways: * - from the snapshot identified by "fromsnap" (if non-null) or * - from the origin of the dataset identified by zhp, which must * be a clone. In this case, "fromsnap" is null and "fromorigin" * is TRUE. * * The send stream is recursive (i.e. dumps a hierarchy of snapshots) and * uses a special header (with a hdrtype field of DMU_COMPOUNDSTREAM) * if "replicate" is set. If "doall" is set, dump all the intermediate * snapshots. The DMU_COMPOUNDSTREAM header is used in the "doall" * case too. If "props" is set, send properties. * * Pre-wrapped (cf. lzc_send_wrapper()). */ static int zfs_send_cb_impl(zfs_handle_t *zhp, const char *fromsnap, const char *tosnap, sendflags_t *flags, int outfd, snapfilter_cb_t filter_func, void *cb_arg, nvlist_t **debugnvp) { char errbuf[ERRBUFLEN]; send_dump_data_t sdd = { 0 }; int err = 0; nvlist_t *fss = NULL; avl_tree_t *fsavl = NULL; static uint64_t holdseq; int spa_version; FILE *fout; (void) snprintf(errbuf, sizeof (errbuf), dgettext(TEXT_DOMAIN, "cannot send '%s'"), zhp->zfs_name); if (fromsnap && fromsnap[0] == '\0') { zfs_error_aux(zhp->zfs_hdl, dgettext(TEXT_DOMAIN, "zero-length incremental source")); return (zfs_error(zhp->zfs_hdl, EZFS_NOENT, errbuf)); } if (fromsnap) { char full_fromsnap_name[ZFS_MAX_DATASET_NAME_LEN]; if (snprintf(full_fromsnap_name, sizeof (full_fromsnap_name), "%s@%s", zhp->zfs_name, fromsnap) >= sizeof (full_fromsnap_name)) { err = EINVAL; goto stderr_out; } zfs_handle_t *fromsnapn = zfs_open(zhp->zfs_hdl, full_fromsnap_name, ZFS_TYPE_SNAPSHOT); if (fromsnapn == NULL) { err = -1; goto err_out; } zfs_close(fromsnapn); } if (flags->replicate || flags->doall || flags->props || flags->holds || flags->backup) { char full_tosnap_name[ZFS_MAX_DATASET_NAME_LEN]; if (snprintf(full_tosnap_name, sizeof (full_tosnap_name), "%s@%s", zhp->zfs_name, tosnap) >= sizeof (full_tosnap_name)) { err = EINVAL; goto stderr_out; } zfs_handle_t *tosnap = zfs_open(zhp->zfs_hdl, full_tosnap_name, ZFS_TYPE_SNAPSHOT); if (tosnap == NULL) { err = -1; goto err_out; } err = send_prelim_records(tosnap, fromsnap, outfd, flags->replicate || flags->props || flags->holds, flags->replicate, flags->verbosity > 0, flags->dryrun, flags->raw, flags->replicate, flags->skipmissing, flags->backup, flags->holds, flags->props, flags->doall, &fss, &fsavl); zfs_close(tosnap); if (err != 0) goto err_out; } /* dump each stream */ sdd.fromsnap = fromsnap; sdd.tosnap = tosnap; sdd.outfd = outfd; sdd.replicate = flags->replicate; sdd.doall = flags->doall; sdd.fromorigin = flags->fromorigin; sdd.fss = fss; sdd.fsavl = fsavl; sdd.verbosity = flags->verbosity; sdd.parsable = flags->parsable; sdd.progress = flags->progress; sdd.dryrun = flags->dryrun; sdd.large_block = flags->largeblock; sdd.embed_data = flags->embed_data; sdd.compress = flags->compress; sdd.raw = flags->raw; sdd.holds = flags->holds; sdd.filter_cb = filter_func; sdd.filter_cb_arg = cb_arg; if (debugnvp) sdd.debugnv = *debugnvp; if (sdd.verbosity != 0 && sdd.dryrun) sdd.std_out = B_TRUE; fout = sdd.std_out ? stdout : stderr; /* * Some flags require that we place user holds on the datasets that are * being sent so they don't get destroyed during the send. We can skip * this step if the pool is imported read-only since the datasets cannot * be destroyed. */ if (!flags->dryrun && !zpool_get_prop_int(zfs_get_pool_handle(zhp), ZPOOL_PROP_READONLY, NULL) && zfs_spa_version(zhp, &spa_version) == 0 && spa_version >= SPA_VERSION_USERREFS && (flags->doall || flags->replicate)) { ++holdseq; (void) snprintf(sdd.holdtag, sizeof (sdd.holdtag), ".send-%d-%llu", getpid(), (u_longlong_t)holdseq); sdd.cleanup_fd = open(ZFS_DEV, O_RDWR | O_CLOEXEC); if (sdd.cleanup_fd < 0) { err = errno; goto stderr_out; } sdd.snapholds = fnvlist_alloc(); } else { sdd.cleanup_fd = -1; sdd.snapholds = NULL; } if (flags->verbosity != 0 || sdd.snapholds != NULL) { /* * Do a verbose no-op dry run to get all the verbose output * or to gather snapshot hold's before generating any data, * then do a non-verbose real run to generate the streams. */ sdd.dryrun = B_TRUE; err = dump_filesystems(zhp, &sdd); if (err != 0) goto stderr_out; if (flags->verbosity != 0) { if (flags->parsable) { (void) fprintf(fout, "size\t%llu\n", (longlong_t)sdd.size); } else { char buf[16]; zfs_nicebytes(sdd.size, buf, sizeof (buf)); (void) fprintf(fout, dgettext(TEXT_DOMAIN, "total estimated size is %s\n"), buf); } } /* Ensure no snaps found is treated as an error. */ if (!sdd.seento) { err = ENOENT; goto err_out; } /* Skip the second run if dryrun was requested. */ if (flags->dryrun) goto err_out; if (sdd.snapholds != NULL) { err = zfs_hold_nvl(zhp, sdd.cleanup_fd, sdd.snapholds); if (err != 0) goto stderr_out; fnvlist_free(sdd.snapholds); sdd.snapholds = NULL; } sdd.dryrun = B_FALSE; sdd.verbosity = 0; } err = dump_filesystems(zhp, &sdd); fsavl_destroy(fsavl); fnvlist_free(fss); /* Ensure no snaps found is treated as an error. */ if (err == 0 && !sdd.seento) err = ENOENT; if (sdd.cleanup_fd != -1) { VERIFY(0 == close(sdd.cleanup_fd)); sdd.cleanup_fd = -1; } if (!flags->dryrun && (flags->replicate || flags->doall || flags->props || flags->backup || flags->holds)) { /* * write final end record. NB: want to do this even if * there was some error, because it might not be totally * failed. */ int err2 = send_conclusion_record(outfd, NULL); if (err2 != 0) return (zfs_standard_error(zhp->zfs_hdl, err2, errbuf)); } return (err || sdd.err); stderr_out: err = zfs_standard_error(zhp->zfs_hdl, err, errbuf); err_out: fsavl_destroy(fsavl); fnvlist_free(fss); fnvlist_free(sdd.snapholds); if (sdd.cleanup_fd != -1) VERIFY(0 == close(sdd.cleanup_fd)); return (err); } struct zfs_send { zfs_handle_t *zhp; const char *fromsnap; const char *tosnap; sendflags_t *flags; snapfilter_cb_t *filter_func; void *cb_arg; nvlist_t **debugnvp; }; static int zfs_send_cb(int outfd, void *arg) { struct zfs_send *zs = arg; return (zfs_send_cb_impl(zs->zhp, zs->fromsnap, zs->tosnap, zs->flags, outfd, zs->filter_func, zs->cb_arg, zs->debugnvp)); } int zfs_send(zfs_handle_t *zhp, const char *fromsnap, const char *tosnap, sendflags_t *flags, int outfd, snapfilter_cb_t filter_func, void *cb_arg, nvlist_t **debugnvp) { struct zfs_send arg = { .zhp = zhp, .fromsnap = fromsnap, .tosnap = tosnap, .flags = flags, .filter_func = filter_func, .cb_arg = cb_arg, .debugnvp = debugnvp, }; return (lzc_send_wrapper(zfs_send_cb, outfd, &arg)); } static zfs_handle_t * name_to_dir_handle(libzfs_handle_t *hdl, const char *snapname) { char dirname[ZFS_MAX_DATASET_NAME_LEN]; (void) strlcpy(dirname, snapname, ZFS_MAX_DATASET_NAME_LEN); char *c = strchr(dirname, '@'); if (c != NULL) *c = '\0'; return (zfs_open(hdl, dirname, ZFS_TYPE_DATASET)); } /* * Returns B_TRUE if earlier is an earlier snapshot in later's timeline; either * an earlier snapshot in the same filesystem, or a snapshot before later's * origin, or it's origin's origin, etc. */ static boolean_t snapshot_is_before(zfs_handle_t *earlier, zfs_handle_t *later) { boolean_t ret; uint64_t later_txg = (later->zfs_type == ZFS_TYPE_FILESYSTEM || later->zfs_type == ZFS_TYPE_VOLUME ? UINT64_MAX : zfs_prop_get_int(later, ZFS_PROP_CREATETXG)); uint64_t earlier_txg = zfs_prop_get_int(earlier, ZFS_PROP_CREATETXG); if (earlier_txg >= later_txg) return (B_FALSE); zfs_handle_t *earlier_dir = name_to_dir_handle(earlier->zfs_hdl, earlier->zfs_name); zfs_handle_t *later_dir = name_to_dir_handle(later->zfs_hdl, later->zfs_name); if (strcmp(earlier_dir->zfs_name, later_dir->zfs_name) == 0) { zfs_close(earlier_dir); zfs_close(later_dir); return (B_TRUE); } char clonename[ZFS_MAX_DATASET_NAME_LEN]; if (zfs_prop_get(later_dir, ZFS_PROP_ORIGIN, clonename, ZFS_MAX_DATASET_NAME_LEN, NULL, NULL, 0, B_TRUE) != 0) { zfs_close(earlier_dir); zfs_close(later_dir); return (B_FALSE); } zfs_handle_t *origin = zfs_open(earlier->zfs_hdl, clonename, ZFS_TYPE_DATASET); uint64_t origin_txg = zfs_prop_get_int(origin, ZFS_PROP_CREATETXG); /* * If "earlier" is exactly the origin, then * snapshot_is_before(earlier, origin) will return false (because * they're the same). */ if (origin_txg == earlier_txg && strcmp(origin->zfs_name, earlier->zfs_name) == 0) { zfs_close(earlier_dir); zfs_close(later_dir); zfs_close(origin); return (B_TRUE); } zfs_close(earlier_dir); zfs_close(later_dir); ret = snapshot_is_before(earlier, origin); zfs_close(origin); return (ret); } /* * The "zhp" argument is the handle of the dataset to send (typically a * snapshot). The "from" argument is the full name of the snapshot or * bookmark that is the incremental source. * * Pre-wrapped (cf. lzc_send_wrapper()). */ static int zfs_send_one_cb_impl(zfs_handle_t *zhp, const char *from, int fd, sendflags_t *flags, const char *redactbook) { int err; libzfs_handle_t *hdl = zhp->zfs_hdl; char *name = zhp->zfs_name; pthread_t ptid; progress_arg_t pa = { 0 }; char errbuf[ERRBUFLEN]; (void) snprintf(errbuf, sizeof (errbuf), dgettext(TEXT_DOMAIN, "warning: cannot send '%s'"), name); if (from != NULL && strchr(from, '@')) { zfs_handle_t *from_zhp = zfs_open(hdl, from, ZFS_TYPE_DATASET); if (from_zhp == NULL) return (-1); if (!snapshot_is_before(from_zhp, zhp)) { zfs_close(from_zhp); zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "not an earlier snapshot from the same fs")); return (zfs_error(hdl, EZFS_CROSSTARGET, errbuf)); } zfs_close(from_zhp); } if (redactbook != NULL) { char bookname[ZFS_MAX_DATASET_NAME_LEN]; nvlist_t *redact_snaps; zfs_handle_t *book_zhp; char *at, *pound; int dsnamelen; pound = strchr(redactbook, '#'); if (pound != NULL) redactbook = pound + 1; at = strchr(name, '@'); if (at == NULL) { zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "cannot do a redacted send to a filesystem")); return (zfs_error(hdl, EZFS_BADTYPE, errbuf)); } dsnamelen = at - name; if (snprintf(bookname, sizeof (bookname), "%.*s#%s", dsnamelen, name, redactbook) >= sizeof (bookname)) { zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "invalid bookmark name")); return (zfs_error(hdl, EZFS_INVALIDNAME, errbuf)); } book_zhp = zfs_open(hdl, bookname, ZFS_TYPE_BOOKMARK); if (book_zhp == NULL) return (-1); if (nvlist_lookup_nvlist(book_zhp->zfs_props, zfs_prop_to_name(ZFS_PROP_REDACT_SNAPS), &redact_snaps) != 0 || redact_snaps == NULL) { zfs_close(book_zhp); zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "not a redaction bookmark")); return (zfs_error(hdl, EZFS_BADTYPE, errbuf)); } zfs_close(book_zhp); } /* * Send fs properties */ if (flags->props || flags->holds || flags->backup) { /* * Note: the header generated by send_prelim_records() * assumes that the incremental source is in the same * filesystem/volume as the target (which is a requirement * when doing "zfs send -R"). But that isn't always the * case here (e.g. send from snap in origin, or send from * bookmark). We pass from=NULL, which will omit this * information from the prelim records; it isn't used * when receiving this type of stream. */ err = send_prelim_records(zhp, NULL, fd, B_TRUE, B_FALSE, flags->verbosity > 0, flags->dryrun, flags->raw, flags->replicate, B_FALSE, flags->backup, flags->holds, flags->props, flags->doall, NULL, NULL); if (err != 0) return (err); } /* * Perform size estimate if verbose was specified. */ if (flags->verbosity != 0) { err = estimate_size(zhp, from, fd, flags, 0, 0, 0, redactbook, errbuf); if (err != 0) return (err); } if (flags->dryrun) return (0); /* * If progress reporting is requested, spawn a new thread to poll * ZFS_IOC_SEND_PROGRESS at a regular interval. */ if (flags->progress) { pa.pa_zhp = zhp; pa.pa_fd = fd; pa.pa_parsable = flags->parsable; pa.pa_estimate = B_FALSE; pa.pa_verbosity = flags->verbosity; err = pthread_create(&ptid, NULL, send_progress_thread, &pa); if (err != 0) { zfs_error_aux(zhp->zfs_hdl, "%s", strerror(errno)); return (zfs_error(zhp->zfs_hdl, EZFS_THREADCREATEFAILED, errbuf)); } } err = lzc_send_redacted(name, from, fd, lzc_flags_from_sendflags(flags), redactbook); if (flags->progress && send_progress_thread_exit(hdl, ptid)) return (-1); if (err == 0 && (flags->props || flags->holds || flags->backup)) { /* Write the final end record. */ err = send_conclusion_record(fd, NULL); if (err != 0) return (zfs_standard_error(hdl, err, errbuf)); } if (err != 0) { switch (errno) { case EXDEV: zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "not an earlier snapshot from the same fs")); return (zfs_error(hdl, EZFS_CROSSTARGET, errbuf)); case ENOENT: case ESRCH: if (lzc_exists(name)) { zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "incremental source (%s) does not exist"), from); } return (zfs_error(hdl, EZFS_NOENT, errbuf)); case EACCES: zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "dataset key must be loaded")); return (zfs_error(hdl, EZFS_CRYPTOFAILED, errbuf)); case EBUSY: zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "target is busy; if a filesystem, " "it must not be mounted")); return (zfs_error(hdl, EZFS_BUSY, errbuf)); case EDQUOT: case EFAULT: case EFBIG: case EINVAL: case EIO: case ENOLINK: case ENOSPC: case ENOSTR: case ENXIO: case EPIPE: case ERANGE: case EROFS: zfs_error_aux(hdl, "%s", strerror(errno)); return (zfs_error(hdl, EZFS_BADBACKUP, errbuf)); default: return (zfs_standard_error(hdl, errno, errbuf)); } } return (err != 0); } struct zfs_send_one { zfs_handle_t *zhp; const char *from; sendflags_t *flags; const char *redactbook; }; static int zfs_send_one_cb(int fd, void *arg) { struct zfs_send_one *zso = arg; return (zfs_send_one_cb_impl(zso->zhp, zso->from, fd, zso->flags, zso->redactbook)); } int zfs_send_one(zfs_handle_t *zhp, const char *from, int fd, sendflags_t *flags, const char *redactbook) { struct zfs_send_one zso = { .zhp = zhp, .from = from, .flags = flags, .redactbook = redactbook, }; return (lzc_send_wrapper(zfs_send_one_cb, fd, &zso)); } /* * Routines specific to "zfs recv" */ static int recv_read(libzfs_handle_t *hdl, int fd, void *buf, int ilen, boolean_t byteswap, zio_cksum_t *zc) { char *cp = buf; int rv; int len = ilen; do { rv = read(fd, cp, len); cp += rv; len -= rv; } while (rv > 0); if (rv < 0 || len != 0) { zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "failed to read from stream")); return (zfs_error(hdl, EZFS_BADSTREAM, dgettext(TEXT_DOMAIN, "cannot receive"))); } if (zc) { if (byteswap) fletcher_4_incremental_byteswap(buf, ilen, zc); else fletcher_4_incremental_native(buf, ilen, zc); } return (0); } static int recv_read_nvlist(libzfs_handle_t *hdl, int fd, int len, nvlist_t **nvp, boolean_t byteswap, zio_cksum_t *zc) { char *buf; int err; buf = zfs_alloc(hdl, len); if (len > hdl->libzfs_max_nvlist) { zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "nvlist too large")); free(buf); return (ENOMEM); } err = recv_read(hdl, fd, buf, len, byteswap, zc); if (err != 0) { free(buf); return (err); } err = nvlist_unpack(buf, len, nvp, 0); free(buf); if (err != 0) { zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "invalid " "stream (malformed nvlist)")); return (EINVAL); } return (0); } /* * Returns the grand origin (origin of origin of origin...) of a given handle. * If this dataset is not a clone, it simply returns a copy of the original * handle. */ static zfs_handle_t * recv_open_grand_origin(zfs_handle_t *zhp) { char origin[ZFS_MAX_DATASET_NAME_LEN]; zprop_source_t src; zfs_handle_t *ozhp = zfs_handle_dup(zhp); while (ozhp != NULL) { if (zfs_prop_get(ozhp, ZFS_PROP_ORIGIN, origin, sizeof (origin), &src, NULL, 0, B_FALSE) != 0) break; (void) zfs_close(ozhp); ozhp = zfs_open(zhp->zfs_hdl, origin, ZFS_TYPE_FILESYSTEM); } return (ozhp); } static int recv_rename_impl(zfs_handle_t *zhp, const char *name, const char *newname) { int err; zfs_handle_t *ozhp = NULL; /* * Attempt to rename the dataset. If it fails with EACCES we have * attempted to rename the dataset outside of its encryption root. * Force the dataset to become an encryption root and try again. */ err = lzc_rename(name, newname); if (err == EACCES) { ozhp = recv_open_grand_origin(zhp); if (ozhp == NULL) { err = ENOENT; goto out; } err = lzc_change_key(ozhp->zfs_name, DCP_CMD_FORCE_NEW_KEY, NULL, NULL, 0); if (err != 0) goto out; err = lzc_rename(name, newname); } out: if (ozhp != NULL) zfs_close(ozhp); return (err); } static int recv_rename(libzfs_handle_t *hdl, const char *name, const char *tryname, int baselen, char *newname, recvflags_t *flags) { static int seq; int err; prop_changelist_t *clp = NULL; zfs_handle_t *zhp = NULL; zhp = zfs_open(hdl, name, ZFS_TYPE_DATASET); if (zhp == NULL) { err = -1; goto out; } clp = changelist_gather(zhp, ZFS_PROP_NAME, 0, flags->force ? MS_FORCE : 0); if (clp == NULL) { err = -1; goto out; } err = changelist_prefix(clp); if (err) goto out; if (tryname) { (void) strlcpy(newname, tryname, ZFS_MAX_DATASET_NAME_LEN); if (flags->verbose) { (void) printf("attempting rename %s to %s\n", name, newname); } err = recv_rename_impl(zhp, name, newname); if (err == 0) changelist_rename(clp, name, tryname); } else { err = ENOENT; } if (err != 0 && strncmp(name + baselen, "recv-", 5) != 0) { seq++; (void) snprintf(newname, ZFS_MAX_DATASET_NAME_LEN, "%.*srecv-%u-%u", baselen, name, getpid(), seq); if (flags->verbose) { (void) printf("failed - trying rename %s to %s\n", name, newname); } err = recv_rename_impl(zhp, name, newname); if (err == 0) changelist_rename(clp, name, newname); if (err && flags->verbose) { (void) printf("failed (%u) - " "will try again on next pass\n", errno); } err = EAGAIN; } else if (flags->verbose) { if (err == 0) (void) printf("success\n"); else (void) printf("failed (%u)\n", errno); } (void) changelist_postfix(clp); out: if (clp != NULL) changelist_free(clp); if (zhp != NULL) zfs_close(zhp); return (err); } static int recv_promote(libzfs_handle_t *hdl, const char *fsname, const char *origin_fsname, recvflags_t *flags) { int err; zfs_cmd_t zc = {"\0"}; zfs_handle_t *zhp = NULL, *ozhp = NULL; if (flags->verbose) (void) printf("promoting %s\n", fsname); (void) strlcpy(zc.zc_value, origin_fsname, sizeof (zc.zc_value)); (void) strlcpy(zc.zc_name, fsname, sizeof (zc.zc_name)); /* * Attempt to promote the dataset. If it fails with EACCES the * promotion would cause this dataset to leave its encryption root. * Force the origin to become an encryption root and try again. */ err = zfs_ioctl(hdl, ZFS_IOC_PROMOTE, &zc); if (err == EACCES) { zhp = zfs_open(hdl, fsname, ZFS_TYPE_DATASET); if (zhp == NULL) { err = -1; goto out; } ozhp = recv_open_grand_origin(zhp); if (ozhp == NULL) { err = -1; goto out; } err = lzc_change_key(ozhp->zfs_name, DCP_CMD_FORCE_NEW_KEY, NULL, NULL, 0); if (err != 0) goto out; err = zfs_ioctl(hdl, ZFS_IOC_PROMOTE, &zc); } out: if (zhp != NULL) zfs_close(zhp); if (ozhp != NULL) zfs_close(ozhp); return (err); } static int recv_destroy(libzfs_handle_t *hdl, const char *name, int baselen, char *newname, recvflags_t *flags) { int err = 0; prop_changelist_t *clp; zfs_handle_t *zhp; boolean_t defer = B_FALSE; int spa_version; zhp = zfs_open(hdl, name, ZFS_TYPE_DATASET); if (zhp == NULL) return (-1); zfs_type_t type = zfs_get_type(zhp); if (type == ZFS_TYPE_SNAPSHOT && zfs_spa_version(zhp, &spa_version) == 0 && spa_version >= SPA_VERSION_USERREFS) defer = B_TRUE; clp = changelist_gather(zhp, ZFS_PROP_NAME, 0, flags->force ? MS_FORCE : 0); zfs_close(zhp); if (clp == NULL) return (-1); err = changelist_prefix(clp); if (err) return (err); if (flags->verbose) (void) printf("attempting destroy %s\n", name); if (type == ZFS_TYPE_SNAPSHOT) { nvlist_t *nv = fnvlist_alloc(); fnvlist_add_boolean(nv, name); err = lzc_destroy_snaps(nv, defer, NULL); fnvlist_free(nv); } else { err = lzc_destroy(name); } if (err == 0) { if (flags->verbose) (void) printf("success\n"); changelist_remove(clp, name); } (void) changelist_postfix(clp); changelist_free(clp); /* * Deferred destroy might destroy the snapshot or only mark it to be * destroyed later, and it returns success in either case. */ if (err != 0 || (defer && zfs_dataset_exists(hdl, name, ZFS_TYPE_SNAPSHOT))) { err = recv_rename(hdl, name, NULL, baselen, newname, flags); } return (err); } typedef struct guid_to_name_data { uint64_t guid; boolean_t bookmark_ok; char *name; char *skip; uint64_t *redact_snap_guids; uint64_t num_redact_snaps; } guid_to_name_data_t; static boolean_t redact_snaps_match(zfs_handle_t *zhp, guid_to_name_data_t *gtnd) { uint64_t *bmark_snaps; uint_t bmark_num_snaps; nvlist_t *nvl; if (zhp->zfs_type != ZFS_TYPE_BOOKMARK) return (B_FALSE); nvl = fnvlist_lookup_nvlist(zhp->zfs_props, zfs_prop_to_name(ZFS_PROP_REDACT_SNAPS)); bmark_snaps = fnvlist_lookup_uint64_array(nvl, ZPROP_VALUE, &bmark_num_snaps); if (bmark_num_snaps != gtnd->num_redact_snaps) return (B_FALSE); int i = 0; for (; i < bmark_num_snaps; i++) { int j = 0; for (; j < bmark_num_snaps; j++) { if (bmark_snaps[i] == gtnd->redact_snap_guids[j]) break; } if (j == bmark_num_snaps) break; } return (i == bmark_num_snaps); } static int guid_to_name_cb(zfs_handle_t *zhp, void *arg) { guid_to_name_data_t *gtnd = arg; const char *slash; int err; if (gtnd->skip != NULL && (slash = strrchr(zhp->zfs_name, '/')) != NULL && strcmp(slash + 1, gtnd->skip) == 0) { zfs_close(zhp); return (0); } if (zfs_prop_get_int(zhp, ZFS_PROP_GUID) == gtnd->guid && (gtnd->num_redact_snaps == -1 || redact_snaps_match(zhp, gtnd))) { (void) strcpy(gtnd->name, zhp->zfs_name); zfs_close(zhp); return (EEXIST); } err = zfs_iter_children(zhp, guid_to_name_cb, gtnd); if (err != EEXIST && gtnd->bookmark_ok) err = zfs_iter_bookmarks(zhp, guid_to_name_cb, gtnd); zfs_close(zhp); return (err); } /* * Attempt to find the local dataset associated with this guid. In the case of * multiple matches, we attempt to find the "best" match by searching * progressively larger portions of the hierarchy. This allows one to send a * tree of datasets individually and guarantee that we will find the source * guid within that hierarchy, even if there are multiple matches elsewhere. * * If num_redact_snaps is not -1, we attempt to find a redaction bookmark with * the specified number of redaction snapshots. If num_redact_snaps isn't 0 or * -1, then redact_snap_guids will be an array of the guids of the snapshots the * redaction bookmark was created with. If num_redact_snaps is -1, then we will * attempt to find a snapshot or bookmark (if bookmark_ok is passed) with the * given guid. Note that a redaction bookmark can be returned if * num_redact_snaps == -1. */ static int guid_to_name_redact_snaps(libzfs_handle_t *hdl, const char *parent, uint64_t guid, boolean_t bookmark_ok, uint64_t *redact_snap_guids, uint64_t num_redact_snaps, char *name) { char pname[ZFS_MAX_DATASET_NAME_LEN]; guid_to_name_data_t gtnd; gtnd.guid = guid; gtnd.bookmark_ok = bookmark_ok; gtnd.name = name; gtnd.skip = NULL; gtnd.redact_snap_guids = redact_snap_guids; gtnd.num_redact_snaps = num_redact_snaps; /* * Search progressively larger portions of the hierarchy, starting * with the filesystem specified by 'parent'. This will * select the "most local" version of the origin snapshot in the case * that there are multiple matching snapshots in the system. */ (void) strlcpy(pname, parent, sizeof (pname)); char *cp = strrchr(pname, '@'); if (cp == NULL) cp = strchr(pname, '\0'); for (; cp != NULL; cp = strrchr(pname, '/')) { /* Chop off the last component and open the parent */ *cp = '\0'; zfs_handle_t *zhp = make_dataset_handle(hdl, pname); if (zhp == NULL) continue; int err = guid_to_name_cb(zfs_handle_dup(zhp), >nd); if (err != EEXIST) err = zfs_iter_children(zhp, guid_to_name_cb, >nd); if (err != EEXIST && bookmark_ok) err = zfs_iter_bookmarks(zhp, guid_to_name_cb, >nd); zfs_close(zhp); if (err == EEXIST) return (0); /* * Remember the last portion of the dataset so we skip it next * time through (as we've already searched that portion of the * hierarchy). */ gtnd.skip = strrchr(pname, '/') + 1; } return (ENOENT); } static int guid_to_name(libzfs_handle_t *hdl, const char *parent, uint64_t guid, boolean_t bookmark_ok, char *name) { return (guid_to_name_redact_snaps(hdl, parent, guid, bookmark_ok, NULL, -1, name)); } /* * Return +1 if guid1 is before guid2, 0 if they are the same, and -1 if * guid1 is after guid2. */ static int created_before(libzfs_handle_t *hdl, avl_tree_t *avl, uint64_t guid1, uint64_t guid2) { nvlist_t *nvfs; char *fsname = NULL, *snapname = NULL; char buf[ZFS_MAX_DATASET_NAME_LEN]; int rv; zfs_handle_t *guid1hdl, *guid2hdl; uint64_t create1, create2; if (guid2 == 0) return (0); if (guid1 == 0) return (1); nvfs = fsavl_find(avl, guid1, &snapname); fsname = fnvlist_lookup_string(nvfs, "name"); (void) snprintf(buf, sizeof (buf), "%s@%s", fsname, snapname); guid1hdl = zfs_open(hdl, buf, ZFS_TYPE_SNAPSHOT); if (guid1hdl == NULL) return (-1); nvfs = fsavl_find(avl, guid2, &snapname); fsname = fnvlist_lookup_string(nvfs, "name"); (void) snprintf(buf, sizeof (buf), "%s@%s", fsname, snapname); guid2hdl = zfs_open(hdl, buf, ZFS_TYPE_SNAPSHOT); if (guid2hdl == NULL) { zfs_close(guid1hdl); return (-1); } create1 = zfs_prop_get_int(guid1hdl, ZFS_PROP_CREATETXG); create2 = zfs_prop_get_int(guid2hdl, ZFS_PROP_CREATETXG); if (create1 < create2) rv = -1; else if (create1 > create2) rv = +1; else rv = 0; zfs_close(guid1hdl); zfs_close(guid2hdl); return (rv); } /* * This function reestablishes the hierarchy of encryption roots after a * recursive incremental receive has completed. This must be done after the * second call to recv_incremental_replication() has renamed and promoted all * sent datasets to their final locations in the dataset hierarchy. */ static int recv_fix_encryption_hierarchy(libzfs_handle_t *hdl, const char *top_zfs, nvlist_t *stream_nv) { int err; nvpair_t *fselem = NULL; nvlist_t *stream_fss; stream_fss = fnvlist_lookup_nvlist(stream_nv, "fss"); while ((fselem = nvlist_next_nvpair(stream_fss, fselem)) != NULL) { zfs_handle_t *zhp = NULL; uint64_t crypt; nvlist_t *snaps, *props, *stream_nvfs = NULL; nvpair_t *snapel = NULL; boolean_t is_encroot, is_clone, stream_encroot; char *cp; char *stream_keylocation = NULL; char keylocation[MAXNAMELEN]; char fsname[ZFS_MAX_DATASET_NAME_LEN]; keylocation[0] = '\0'; stream_nvfs = fnvpair_value_nvlist(fselem); snaps = fnvlist_lookup_nvlist(stream_nvfs, "snaps"); props = fnvlist_lookup_nvlist(stream_nvfs, "props"); stream_encroot = nvlist_exists(stream_nvfs, "is_encroot"); /* find a snapshot from the stream that exists locally */ err = ENOENT; while ((snapel = nvlist_next_nvpair(snaps, snapel)) != NULL) { uint64_t guid; guid = fnvpair_value_uint64(snapel); err = guid_to_name(hdl, top_zfs, guid, B_FALSE, fsname); if (err == 0) break; } if (err != 0) continue; cp = strchr(fsname, '@'); if (cp != NULL) *cp = '\0'; zhp = zfs_open(hdl, fsname, ZFS_TYPE_DATASET); if (zhp == NULL) { err = ENOENT; goto error; } crypt = zfs_prop_get_int(zhp, ZFS_PROP_ENCRYPTION); is_clone = zhp->zfs_dmustats.dds_origin[0] != '\0'; (void) zfs_crypto_get_encryption_root(zhp, &is_encroot, NULL); /* we don't need to do anything for unencrypted datasets */ if (crypt == ZIO_CRYPT_OFF) { zfs_close(zhp); continue; } /* * If the dataset is flagged as an encryption root, was not * received as a clone and is not currently an encryption root, * force it to become one. Fixup the keylocation if necessary. */ if (stream_encroot) { if (!is_clone && !is_encroot) { err = lzc_change_key(fsname, DCP_CMD_FORCE_NEW_KEY, NULL, NULL, 0); if (err != 0) { zfs_close(zhp); goto error; } } stream_keylocation = fnvlist_lookup_string(props, zfs_prop_to_name(ZFS_PROP_KEYLOCATION)); /* * Refresh the properties in case the call to * lzc_change_key() changed the value. */ zfs_refresh_properties(zhp); err = zfs_prop_get(zhp, ZFS_PROP_KEYLOCATION, keylocation, sizeof (keylocation), NULL, NULL, 0, B_TRUE); if (err != 0) { zfs_close(zhp); goto error; } if (strcmp(keylocation, stream_keylocation) != 0) { err = zfs_prop_set(zhp, zfs_prop_to_name(ZFS_PROP_KEYLOCATION), stream_keylocation); if (err != 0) { zfs_close(zhp); goto error; } } } /* * If the dataset is not flagged as an encryption root and is * currently an encryption root, force it to inherit from its * parent. The root of a raw send should never be * force-inherited. */ if (!stream_encroot && is_encroot && strcmp(top_zfs, fsname) != 0) { err = lzc_change_key(fsname, DCP_CMD_FORCE_INHERIT, NULL, NULL, 0); if (err != 0) { zfs_close(zhp); goto error; } } zfs_close(zhp); } return (0); error: return (err); } static int recv_incremental_replication(libzfs_handle_t *hdl, const char *tofs, recvflags_t *flags, nvlist_t *stream_nv, avl_tree_t *stream_avl, nvlist_t *renamed) { nvlist_t *local_nv, *deleted = NULL; avl_tree_t *local_avl; nvpair_t *fselem, *nextfselem; char *fromsnap; char newname[ZFS_MAX_DATASET_NAME_LEN]; char guidname[32]; int error; boolean_t needagain, progress, recursive; char *s1, *s2; fromsnap = fnvlist_lookup_string(stream_nv, "fromsnap"); recursive = (nvlist_lookup_boolean(stream_nv, "not_recursive") == ENOENT); if (flags->dryrun) return (0); again: needagain = progress = B_FALSE; deleted = fnvlist_alloc(); if ((error = gather_nvlist(hdl, tofs, fromsnap, NULL, recursive, B_TRUE, B_FALSE, recursive, B_FALSE, B_FALSE, B_FALSE, B_FALSE, B_TRUE, &local_nv, &local_avl)) != 0) return (error); /* * Process deletes and renames */ for (fselem = nvlist_next_nvpair(local_nv, NULL); fselem; fselem = nextfselem) { nvlist_t *nvfs, *snaps; nvlist_t *stream_nvfs = NULL; nvpair_t *snapelem, *nextsnapelem; uint64_t fromguid = 0; uint64_t originguid = 0; uint64_t stream_originguid = 0; uint64_t parent_fromsnap_guid, stream_parent_fromsnap_guid; char *fsname, *stream_fsname; nextfselem = nvlist_next_nvpair(local_nv, fselem); nvfs = fnvpair_value_nvlist(fselem); snaps = fnvlist_lookup_nvlist(nvfs, "snaps"); fsname = fnvlist_lookup_string(nvfs, "name"); parent_fromsnap_guid = fnvlist_lookup_uint64(nvfs, "parentfromsnap"); (void) nvlist_lookup_uint64(nvfs, "origin", &originguid); /* * First find the stream's fs, so we can check for * a different origin (due to "zfs promote") */ for (snapelem = nvlist_next_nvpair(snaps, NULL); snapelem; snapelem = nvlist_next_nvpair(snaps, snapelem)) { uint64_t thisguid; thisguid = fnvpair_value_uint64(snapelem); stream_nvfs = fsavl_find(stream_avl, thisguid, NULL); if (stream_nvfs != NULL) break; } /* check for promote */ (void) nvlist_lookup_uint64(stream_nvfs, "origin", &stream_originguid); if (stream_nvfs && originguid != stream_originguid) { switch (created_before(hdl, local_avl, stream_originguid, originguid)) { case 1: { /* promote it! */ nvlist_t *origin_nvfs; char *origin_fsname; origin_nvfs = fsavl_find(local_avl, originguid, NULL); origin_fsname = fnvlist_lookup_string( origin_nvfs, "name"); error = recv_promote(hdl, fsname, origin_fsname, flags); if (error == 0) progress = B_TRUE; break; } default: break; case -1: fsavl_destroy(local_avl); fnvlist_free(local_nv); return (-1); } /* * We had/have the wrong origin, therefore our * list of snapshots is wrong. Need to handle * them on the next pass. */ needagain = B_TRUE; continue; } for (snapelem = nvlist_next_nvpair(snaps, NULL); snapelem; snapelem = nextsnapelem) { uint64_t thisguid; char *stream_snapname; nvlist_t *found, *props; nextsnapelem = nvlist_next_nvpair(snaps, snapelem); thisguid = fnvpair_value_uint64(snapelem); found = fsavl_find(stream_avl, thisguid, &stream_snapname); /* check for delete */ if (found == NULL) { char name[ZFS_MAX_DATASET_NAME_LEN]; if (!flags->force) continue; (void) snprintf(name, sizeof (name), "%s@%s", fsname, nvpair_name(snapelem)); error = recv_destroy(hdl, name, strlen(fsname)+1, newname, flags); if (error) needagain = B_TRUE; else progress = B_TRUE; sprintf(guidname, "%llu", (u_longlong_t)thisguid); nvlist_add_boolean(deleted, guidname); continue; } stream_nvfs = found; if (0 == nvlist_lookup_nvlist(stream_nvfs, "snapprops", &props) && 0 == nvlist_lookup_nvlist(props, stream_snapname, &props)) { zfs_cmd_t zc = {"\0"}; zc.zc_cookie = B_TRUE; /* received */ (void) snprintf(zc.zc_name, sizeof (zc.zc_name), "%s@%s", fsname, nvpair_name(snapelem)); zcmd_write_src_nvlist(hdl, &zc, props); (void) zfs_ioctl(hdl, ZFS_IOC_SET_PROP, &zc); zcmd_free_nvlists(&zc); } /* check for different snapname */ if (strcmp(nvpair_name(snapelem), stream_snapname) != 0) { char name[ZFS_MAX_DATASET_NAME_LEN]; char tryname[ZFS_MAX_DATASET_NAME_LEN]; (void) snprintf(name, sizeof (name), "%s@%s", fsname, nvpair_name(snapelem)); (void) snprintf(tryname, sizeof (name), "%s@%s", fsname, stream_snapname); error = recv_rename(hdl, name, tryname, strlen(fsname)+1, newname, flags); if (error) needagain = B_TRUE; else progress = B_TRUE; } if (strcmp(stream_snapname, fromsnap) == 0) fromguid = thisguid; } /* check for delete */ if (stream_nvfs == NULL) { if (!flags->force) continue; error = recv_destroy(hdl, fsname, strlen(tofs)+1, newname, flags); if (error) needagain = B_TRUE; else progress = B_TRUE; sprintf(guidname, "%llu", (u_longlong_t)parent_fromsnap_guid); nvlist_add_boolean(deleted, guidname); continue; } if (fromguid == 0) { if (flags->verbose) { (void) printf("local fs %s does not have " "fromsnap (%s in stream); must have " "been deleted locally; ignoring\n", fsname, fromsnap); } continue; } stream_fsname = fnvlist_lookup_string(stream_nvfs, "name"); stream_parent_fromsnap_guid = fnvlist_lookup_uint64( stream_nvfs, "parentfromsnap"); s1 = strrchr(fsname, '/'); s2 = strrchr(stream_fsname, '/'); /* * Check if we're going to rename based on parent guid change * and the current parent guid was also deleted. If it was then * rename will fail and is likely unneeded, so avoid this and * force an early retry to determine the new * parent_fromsnap_guid. */ if (stream_parent_fromsnap_guid != 0 && parent_fromsnap_guid != 0 && stream_parent_fromsnap_guid != parent_fromsnap_guid) { sprintf(guidname, "%llu", (u_longlong_t)parent_fromsnap_guid); if (nvlist_exists(deleted, guidname)) { progress = B_TRUE; needagain = B_TRUE; goto doagain; } } /* * Check for rename. If the exact receive path is specified, it * does not count as a rename, but we still need to check the * datasets beneath it. */ if ((stream_parent_fromsnap_guid != 0 && parent_fromsnap_guid != 0 && stream_parent_fromsnap_guid != parent_fromsnap_guid) || ((flags->isprefix || strcmp(tofs, fsname) != 0) && (s1 != NULL) && (s2 != NULL) && strcmp(s1, s2) != 0)) { nvlist_t *parent; char tryname[ZFS_MAX_DATASET_NAME_LEN]; parent = fsavl_find(local_avl, stream_parent_fromsnap_guid, NULL); /* * NB: parent might not be found if we used the * tosnap for stream_parent_fromsnap_guid, * because the parent is a newly-created fs; * we'll be able to rename it after we recv the * new fs. */ if (parent != NULL) { char *pname; pname = fnvlist_lookup_string(parent, "name"); (void) snprintf(tryname, sizeof (tryname), "%s%s", pname, strrchr(stream_fsname, '/')); } else { tryname[0] = '\0'; if (flags->verbose) { (void) printf("local fs %s new parent " "not found\n", fsname); } } newname[0] = '\0'; error = recv_rename(hdl, fsname, tryname, strlen(tofs)+1, newname, flags); if (renamed != NULL && newname[0] != '\0') { fnvlist_add_boolean(renamed, newname); } if (error) needagain = B_TRUE; else progress = B_TRUE; } } doagain: fsavl_destroy(local_avl); fnvlist_free(local_nv); fnvlist_free(deleted); if (needagain && progress) { /* do another pass to fix up temporary names */ if (flags->verbose) (void) printf("another pass:\n"); goto again; } return (needagain || error != 0); } static int zfs_receive_package(libzfs_handle_t *hdl, int fd, const char *destname, recvflags_t *flags, dmu_replay_record_t *drr, zio_cksum_t *zc, char **top_zfs, nvlist_t *cmdprops) { nvlist_t *stream_nv = NULL; avl_tree_t *stream_avl = NULL; char *fromsnap = NULL; char *sendsnap = NULL; char *cp; char tofs[ZFS_MAX_DATASET_NAME_LEN]; char sendfs[ZFS_MAX_DATASET_NAME_LEN]; char errbuf[ERRBUFLEN]; dmu_replay_record_t drre; int error; boolean_t anyerr = B_FALSE; boolean_t softerr = B_FALSE; boolean_t recursive, raw; (void) snprintf(errbuf, sizeof (errbuf), dgettext(TEXT_DOMAIN, "cannot receive")); assert(drr->drr_type == DRR_BEGIN); assert(drr->drr_u.drr_begin.drr_magic == DMU_BACKUP_MAGIC); assert(DMU_GET_STREAM_HDRTYPE(drr->drr_u.drr_begin.drr_versioninfo) == DMU_COMPOUNDSTREAM); /* * Read in the nvlist from the stream. */ if (drr->drr_payloadlen != 0) { error = recv_read_nvlist(hdl, fd, drr->drr_payloadlen, &stream_nv, flags->byteswap, zc); if (error) { error = zfs_error(hdl, EZFS_BADSTREAM, errbuf); goto out; } } recursive = (nvlist_lookup_boolean(stream_nv, "not_recursive") == ENOENT); raw = (nvlist_lookup_boolean(stream_nv, "raw") == 0); if (recursive && strchr(destname, '@')) { zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "cannot specify snapshot name for multi-snapshot stream")); error = zfs_error(hdl, EZFS_BADSTREAM, errbuf); goto out; } /* * Read in the end record and verify checksum. */ if (0 != (error = recv_read(hdl, fd, &drre, sizeof (drre), flags->byteswap, NULL))) goto out; if (flags->byteswap) { drre.drr_type = BSWAP_32(drre.drr_type); drre.drr_u.drr_end.drr_checksum.zc_word[0] = BSWAP_64(drre.drr_u.drr_end.drr_checksum.zc_word[0]); drre.drr_u.drr_end.drr_checksum.zc_word[1] = BSWAP_64(drre.drr_u.drr_end.drr_checksum.zc_word[1]); drre.drr_u.drr_end.drr_checksum.zc_word[2] = BSWAP_64(drre.drr_u.drr_end.drr_checksum.zc_word[2]); drre.drr_u.drr_end.drr_checksum.zc_word[3] = BSWAP_64(drre.drr_u.drr_end.drr_checksum.zc_word[3]); } if (drre.drr_type != DRR_END) { error = zfs_error(hdl, EZFS_BADSTREAM, errbuf); goto out; } if (!ZIO_CHECKSUM_EQUAL(drre.drr_u.drr_end.drr_checksum, *zc)) { zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "incorrect header checksum")); error = zfs_error(hdl, EZFS_BADSTREAM, errbuf); goto out; } (void) nvlist_lookup_string(stream_nv, "fromsnap", &fromsnap); if (drr->drr_payloadlen != 0) { nvlist_t *stream_fss; stream_fss = fnvlist_lookup_nvlist(stream_nv, "fss"); if ((stream_avl = fsavl_create(stream_fss)) == NULL) { zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "couldn't allocate avl tree")); error = zfs_error(hdl, EZFS_NOMEM, errbuf); goto out; } if (fromsnap != NULL && recursive) { nvlist_t *renamed = NULL; nvpair_t *pair = NULL; (void) strlcpy(tofs, destname, sizeof (tofs)); if (flags->isprefix) { struct drr_begin *drrb = &drr->drr_u.drr_begin; int i; if (flags->istail) { cp = strrchr(drrb->drr_toname, '/'); if (cp == NULL) { (void) strlcat(tofs, "/", sizeof (tofs)); i = 0; } else { i = (cp - drrb->drr_toname); } } else { i = strcspn(drrb->drr_toname, "/@"); } /* zfs_receive_one() will create_parents() */ (void) strlcat(tofs, &drrb->drr_toname[i], sizeof (tofs)); *strchr(tofs, '@') = '\0'; } if (!flags->dryrun && !flags->nomount) { renamed = fnvlist_alloc(); } softerr = recv_incremental_replication(hdl, tofs, flags, stream_nv, stream_avl, renamed); /* Unmount renamed filesystems before receiving. */ while ((pair = nvlist_next_nvpair(renamed, pair)) != NULL) { zfs_handle_t *zhp; prop_changelist_t *clp = NULL; zhp = zfs_open(hdl, nvpair_name(pair), ZFS_TYPE_FILESYSTEM); if (zhp != NULL) { clp = changelist_gather(zhp, ZFS_PROP_MOUNTPOINT, 0, flags->forceunmount ? MS_FORCE : 0); zfs_close(zhp); if (clp != NULL) { softerr |= changelist_prefix(clp); changelist_free(clp); } } } fnvlist_free(renamed); } } /* * Get the fs specified by the first path in the stream (the top level * specified by 'zfs send') and pass it to each invocation of * zfs_receive_one(). */ (void) strlcpy(sendfs, drr->drr_u.drr_begin.drr_toname, sizeof (sendfs)); if ((cp = strchr(sendfs, '@')) != NULL) { *cp = '\0'; /* * Find the "sendsnap", the final snapshot in a replication * stream. zfs_receive_one() handles certain errors * differently, depending on if the contained stream is the * last one or not. */ sendsnap = (cp + 1); } /* Finally, receive each contained stream */ do { /* * we should figure out if it has a recoverable * error, in which case do a recv_skip() and drive on. * Note, if we fail due to already having this guid, * zfs_receive_one() will take care of it (ie, * recv_skip() and return 0). */ error = zfs_receive_impl(hdl, destname, NULL, flags, fd, sendfs, stream_nv, stream_avl, top_zfs, sendsnap, cmdprops); if (error == ENODATA) { error = 0; break; } anyerr |= error; } while (error == 0); if (drr->drr_payloadlen != 0 && recursive && fromsnap != NULL) { /* * Now that we have the fs's they sent us, try the * renames again. */ softerr = recv_incremental_replication(hdl, tofs, flags, stream_nv, stream_avl, NULL); } if (raw && softerr == 0 && *top_zfs != NULL) { softerr = recv_fix_encryption_hierarchy(hdl, *top_zfs, stream_nv); } out: fsavl_destroy(stream_avl); fnvlist_free(stream_nv); if (softerr) error = -2; if (anyerr) error = -1; return (error); } static void trunc_prop_errs(int truncated) { ASSERT(truncated != 0); if (truncated == 1) (void) fprintf(stderr, dgettext(TEXT_DOMAIN, "1 more property could not be set\n")); else (void) fprintf(stderr, dgettext(TEXT_DOMAIN, "%d more properties could not be set\n"), truncated); } static int recv_skip(libzfs_handle_t *hdl, int fd, boolean_t byteswap) { dmu_replay_record_t *drr; void *buf = zfs_alloc(hdl, SPA_MAXBLOCKSIZE); uint64_t payload_size; char errbuf[ERRBUFLEN]; (void) snprintf(errbuf, sizeof (errbuf), dgettext(TEXT_DOMAIN, "cannot receive")); /* XXX would be great to use lseek if possible... */ drr = buf; while (recv_read(hdl, fd, drr, sizeof (dmu_replay_record_t), byteswap, NULL) == 0) { if (byteswap) drr->drr_type = BSWAP_32(drr->drr_type); switch (drr->drr_type) { case DRR_BEGIN: if (drr->drr_payloadlen != 0) { (void) recv_read(hdl, fd, buf, drr->drr_payloadlen, B_FALSE, NULL); } break; case DRR_END: free(buf); return (0); case DRR_OBJECT: if (byteswap) { drr->drr_u.drr_object.drr_bonuslen = BSWAP_32(drr->drr_u.drr_object. drr_bonuslen); drr->drr_u.drr_object.drr_raw_bonuslen = BSWAP_32(drr->drr_u.drr_object. drr_raw_bonuslen); } payload_size = DRR_OBJECT_PAYLOAD_SIZE(&drr->drr_u.drr_object); (void) recv_read(hdl, fd, buf, payload_size, B_FALSE, NULL); break; case DRR_WRITE: if (byteswap) { drr->drr_u.drr_write.drr_logical_size = BSWAP_64( drr->drr_u.drr_write.drr_logical_size); drr->drr_u.drr_write.drr_compressed_size = BSWAP_64( drr->drr_u.drr_write.drr_compressed_size); } payload_size = DRR_WRITE_PAYLOAD_SIZE(&drr->drr_u.drr_write); assert(payload_size <= SPA_MAXBLOCKSIZE); (void) recv_read(hdl, fd, buf, payload_size, B_FALSE, NULL); break; case DRR_SPILL: if (byteswap) { drr->drr_u.drr_spill.drr_length = BSWAP_64(drr->drr_u.drr_spill.drr_length); drr->drr_u.drr_spill.drr_compressed_size = BSWAP_64(drr->drr_u.drr_spill. drr_compressed_size); } payload_size = DRR_SPILL_PAYLOAD_SIZE(&drr->drr_u.drr_spill); (void) recv_read(hdl, fd, buf, payload_size, B_FALSE, NULL); break; case DRR_WRITE_EMBEDDED: if (byteswap) { drr->drr_u.drr_write_embedded.drr_psize = BSWAP_32(drr->drr_u.drr_write_embedded. drr_psize); } (void) recv_read(hdl, fd, buf, P2ROUNDUP(drr->drr_u.drr_write_embedded.drr_psize, 8), B_FALSE, NULL); break; case DRR_OBJECT_RANGE: case DRR_WRITE_BYREF: case DRR_FREEOBJECTS: case DRR_FREE: break; default: zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "invalid record type")); free(buf); return (zfs_error(hdl, EZFS_BADSTREAM, errbuf)); } } free(buf); return (-1); } static void recv_ecksum_set_aux(libzfs_handle_t *hdl, const char *target_snap, boolean_t resumable, boolean_t checksum) { char target_fs[ZFS_MAX_DATASET_NAME_LEN]; zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, (checksum ? "checksum mismatch" : "incomplete stream"))); if (!resumable) return; (void) strlcpy(target_fs, target_snap, sizeof (target_fs)); *strchr(target_fs, '@') = '\0'; zfs_handle_t *zhp = zfs_open(hdl, target_fs, ZFS_TYPE_FILESYSTEM | ZFS_TYPE_VOLUME); if (zhp == NULL) return; char token_buf[ZFS_MAXPROPLEN]; int error = zfs_prop_get(zhp, ZFS_PROP_RECEIVE_RESUME_TOKEN, token_buf, sizeof (token_buf), NULL, NULL, 0, B_TRUE); if (error == 0) { zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "checksum mismatch or incomplete stream.\n" "Partially received snapshot is saved.\n" "A resuming stream can be generated on the sending " "system by running:\n" " zfs send -t %s"), token_buf); } zfs_close(zhp); } /* * Prepare a new nvlist of properties that are to override (-o) or be excluded * (-x) from the received dataset * recvprops: received properties from the send stream * cmdprops: raw input properties from command line * origprops: properties, both locally-set and received, currently set on the * target dataset if it exists, NULL otherwise. * oxprops: valid output override (-o) and excluded (-x) properties */ static int zfs_setup_cmdline_props(libzfs_handle_t *hdl, zfs_type_t type, char *fsname, boolean_t zoned, boolean_t recursive, boolean_t newfs, boolean_t raw, boolean_t toplevel, nvlist_t *recvprops, nvlist_t *cmdprops, nvlist_t *origprops, nvlist_t **oxprops, uint8_t **wkeydata_out, uint_t *wkeylen_out, const char *errbuf) { nvpair_t *nvp; nvlist_t *oprops, *voprops; zfs_handle_t *zhp = NULL; zpool_handle_t *zpool_hdl = NULL; char *cp; int ret = 0; char namebuf[ZFS_MAX_DATASET_NAME_LEN]; if (nvlist_empty(cmdprops)) return (0); /* No properties to override or exclude */ *oxprops = fnvlist_alloc(); oprops = fnvlist_alloc(); strlcpy(namebuf, fsname, ZFS_MAX_DATASET_NAME_LEN); /* * Get our dataset handle. The target dataset may not exist yet. */ if (zfs_dataset_exists(hdl, namebuf, ZFS_TYPE_DATASET)) { zhp = zfs_open(hdl, namebuf, ZFS_TYPE_DATASET); if (zhp == NULL) { ret = -1; goto error; } } /* open the zpool handle */ cp = strchr(namebuf, '/'); if (cp != NULL) *cp = '\0'; zpool_hdl = zpool_open(hdl, namebuf); if (zpool_hdl == NULL) { ret = -1; goto error; } /* restore namebuf to match fsname for later use */ if (cp != NULL) *cp = '/'; /* * first iteration: process excluded (-x) properties now and gather * added (-o) properties to be later processed by zfs_valid_proplist() */ nvp = NULL; while ((nvp = nvlist_next_nvpair(cmdprops, nvp)) != NULL) { const char *name = nvpair_name(nvp); zfs_prop_t prop = zfs_name_to_prop(name); /* * It turns out, if we don't normalize "aliased" names * e.g. compress= against the "real" names (e.g. compression) * here, then setting/excluding them does not work as * intended. * * But since user-defined properties wouldn't have a valid * mapping here, we do this conditional dance. */ const char *newname = name; if (prop >= ZFS_PROP_TYPE) newname = zfs_prop_to_name(prop); /* "origin" is processed separately, don't handle it here */ if (prop == ZFS_PROP_ORIGIN) continue; /* raw streams can't override encryption properties */ if ((zfs_prop_encryption_key_param(prop) || prop == ZFS_PROP_ENCRYPTION) && raw) { zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "encryption property '%s' cannot " "be set or excluded for raw streams."), name); ret = zfs_error(hdl, EZFS_BADPROP, errbuf); goto error; } /* incremental streams can only exclude encryption properties */ if ((zfs_prop_encryption_key_param(prop) || prop == ZFS_PROP_ENCRYPTION) && !newfs && nvpair_type(nvp) != DATA_TYPE_BOOLEAN) { zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "encryption property '%s' cannot " "be set for incremental streams."), name); ret = zfs_error(hdl, EZFS_BADPROP, errbuf); goto error; } switch (nvpair_type(nvp)) { case DATA_TYPE_BOOLEAN: /* -x property */ /* * DATA_TYPE_BOOLEAN is the way we're asked to "exclude" * a property: this is done by forcing an explicit * inherit on the destination so the effective value is * not the one we received from the send stream. */ if (!zfs_prop_valid_for_type(prop, type, B_FALSE) && !zfs_prop_user(name)) { (void) fprintf(stderr, dgettext(TEXT_DOMAIN, "Warning: %s: property '%s' does not " "apply to datasets of this type\n"), fsname, name); continue; } /* * We do this only if the property is not already * locally-set, in which case its value will take * priority over the received anyway. */ if (nvlist_exists(origprops, newname)) { nvlist_t *attrs; char *source = NULL; attrs = fnvlist_lookup_nvlist(origprops, newname); if (nvlist_lookup_string(attrs, ZPROP_SOURCE, &source) == 0 && strcmp(source, ZPROP_SOURCE_VAL_RECVD) != 0) continue; } /* * We can't force an explicit inherit on non-inheritable * properties: if we're asked to exclude this kind of * values we remove them from "recvprops" input nvlist. */ if (!zfs_prop_user(name) && /* can be inherited too */ !zfs_prop_inheritable(prop) && nvlist_exists(recvprops, newname)) fnvlist_remove(recvprops, newname); else fnvlist_add_boolean(*oxprops, newname); break; case DATA_TYPE_STRING: /* -o property=value */ /* * we're trying to override a property that does not * make sense for this type of dataset, but we don't * want to fail if the receive is recursive: this comes * in handy when the send stream contains, for * instance, a child ZVOL and we're trying to receive * it with "-o atime=on" */ if (!zfs_prop_valid_for_type(prop, type, B_FALSE) && !zfs_prop_user(name)) { if (recursive) continue; zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "property '%s' does not apply to datasets " "of this type"), name); ret = zfs_error(hdl, EZFS_BADPROP, errbuf); goto error; } fnvlist_add_string(oprops, newname, fnvpair_value_string(nvp)); break; default: zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "property '%s' must be a string or boolean"), name); ret = zfs_error(hdl, EZFS_BADPROP, errbuf); goto error; } } if (toplevel) { /* convert override strings properties to native */ if ((voprops = zfs_valid_proplist(hdl, ZFS_TYPE_DATASET, oprops, zoned, zhp, zpool_hdl, B_FALSE, errbuf)) == NULL) { ret = zfs_error(hdl, EZFS_BADPROP, errbuf); goto error; } /* * zfs_crypto_create() requires the parent name. Get it * by truncating the fsname copy stored in namebuf. */ cp = strrchr(namebuf, '/'); if (cp != NULL) *cp = '\0'; if (!raw && zfs_crypto_create(hdl, namebuf, voprops, NULL, B_FALSE, wkeydata_out, wkeylen_out) != 0) { fnvlist_free(voprops); ret = zfs_error(hdl, EZFS_CRYPTOFAILED, errbuf); goto error; } /* second pass: process "-o" properties */ fnvlist_merge(*oxprops, voprops); fnvlist_free(voprops); } else { /* override props on child dataset are inherited */ nvp = NULL; while ((nvp = nvlist_next_nvpair(oprops, nvp)) != NULL) { const char *name = nvpair_name(nvp); fnvlist_add_boolean(*oxprops, name); } } error: if (zhp != NULL) zfs_close(zhp); if (zpool_hdl != NULL) zpool_close(zpool_hdl); fnvlist_free(oprops); return (ret); } /* * Restores a backup of tosnap from the file descriptor specified by infd. */ static int zfs_receive_one(libzfs_handle_t *hdl, int infd, const char *tosnap, const char *originsnap, recvflags_t *flags, dmu_replay_record_t *drr, dmu_replay_record_t *drr_noswap, const char *sendfs, nvlist_t *stream_nv, avl_tree_t *stream_avl, char **top_zfs, const char *finalsnap, nvlist_t *cmdprops) { struct timespec begin_time; int ioctl_err, ioctl_errno, err; char *cp; struct drr_begin *drrb = &drr->drr_u.drr_begin; char errbuf[ERRBUFLEN]; const char *chopprefix; boolean_t newfs = B_FALSE; boolean_t stream_wantsnewfs, stream_resumingnewfs; boolean_t newprops = B_FALSE; uint64_t read_bytes = 0; uint64_t errflags = 0; uint64_t parent_snapguid = 0; prop_changelist_t *clp = NULL; nvlist_t *snapprops_nvlist = NULL; nvlist_t *snapholds_nvlist = NULL; zprop_errflags_t prop_errflags; nvlist_t *prop_errors = NULL; boolean_t recursive; char *snapname = NULL; char destsnap[MAXPATHLEN * 2]; char origin[MAXNAMELEN] = {0}; char name[MAXPATHLEN]; char tmp_keylocation[MAXNAMELEN] = {0}; nvlist_t *rcvprops = NULL; /* props received from the send stream */ nvlist_t *oxprops = NULL; /* override (-o) and exclude (-x) props */ nvlist_t *origprops = NULL; /* original props (if destination exists) */ zfs_type_t type = ZFS_TYPE_INVALID; boolean_t toplevel = B_FALSE; boolean_t zoned = B_FALSE; boolean_t hastoken = B_FALSE; boolean_t redacted; uint8_t *wkeydata = NULL; uint_t wkeylen = 0; #ifndef CLOCK_MONOTONIC_RAW #define CLOCK_MONOTONIC_RAW CLOCK_MONOTONIC #endif clock_gettime(CLOCK_MONOTONIC_RAW, &begin_time); (void) snprintf(errbuf, sizeof (errbuf), dgettext(TEXT_DOMAIN, "cannot receive")); recursive = (nvlist_lookup_boolean(stream_nv, "not_recursive") == ENOENT); /* Did the user request holds be skipped via zfs recv -k? */ boolean_t holds = flags->holds && !flags->skipholds; if (stream_avl != NULL) { char *keylocation = NULL; nvlist_t *lookup = NULL; nvlist_t *fs = fsavl_find(stream_avl, drrb->drr_toguid, &snapname); (void) nvlist_lookup_uint64(fs, "parentfromsnap", &parent_snapguid); err = nvlist_lookup_nvlist(fs, "props", &rcvprops); if (err) { rcvprops = fnvlist_alloc(); newprops = B_TRUE; } /* * The keylocation property may only be set on encryption roots, * but this dataset might not become an encryption root until * recv_fix_encryption_hierarchy() is called. That function * will fixup the keylocation anyway, so we temporarily unset * the keylocation for now to avoid any errors from the receive * ioctl. */ err = nvlist_lookup_string(rcvprops, zfs_prop_to_name(ZFS_PROP_KEYLOCATION), &keylocation); if (err == 0) { strlcpy(tmp_keylocation, keylocation, MAXNAMELEN); (void) nvlist_remove_all(rcvprops, zfs_prop_to_name(ZFS_PROP_KEYLOCATION)); } if (flags->canmountoff) { fnvlist_add_uint64(rcvprops, zfs_prop_to_name(ZFS_PROP_CANMOUNT), 0); } else if (newprops) { /* nothing in rcvprops, eliminate it */ fnvlist_free(rcvprops); rcvprops = NULL; newprops = B_FALSE; } if (0 == nvlist_lookup_nvlist(fs, "snapprops", &lookup)) { snapprops_nvlist = fnvlist_lookup_nvlist(lookup, snapname); } if (holds) { if (0 == nvlist_lookup_nvlist(fs, "snapholds", &lookup)) { snapholds_nvlist = fnvlist_lookup_nvlist( lookup, snapname); } } } cp = NULL; /* * Determine how much of the snapshot name stored in the stream * we are going to tack on to the name they specified on the * command line, and how much we are going to chop off. * * If they specified a snapshot, chop the entire name stored in * the stream. */ if (flags->istail) { /* * A filesystem was specified with -e. We want to tack on only * the tail of the sent snapshot path. */ if (strchr(tosnap, '@')) { zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "invalid " "argument - snapshot not allowed with -e")); err = zfs_error(hdl, EZFS_INVALIDNAME, errbuf); goto out; } chopprefix = strrchr(sendfs, '/'); if (chopprefix == NULL) { /* * The tail is the poolname, so we need to * prepend a path separator. */ int len = strlen(drrb->drr_toname); cp = umem_alloc(len + 2, UMEM_NOFAIL); cp[0] = '/'; (void) strcpy(&cp[1], drrb->drr_toname); chopprefix = cp; } else { chopprefix = drrb->drr_toname + (chopprefix - sendfs); } } else if (flags->isprefix) { /* * A filesystem was specified with -d. We want to tack on * everything but the first element of the sent snapshot path * (all but the pool name). */ if (strchr(tosnap, '@')) { zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "invalid " "argument - snapshot not allowed with -d")); err = zfs_error(hdl, EZFS_INVALIDNAME, errbuf); goto out; } chopprefix = strchr(drrb->drr_toname, '/'); if (chopprefix == NULL) chopprefix = strchr(drrb->drr_toname, '@'); } else if (strchr(tosnap, '@') == NULL) { /* * If a filesystem was specified without -d or -e, we want to * tack on everything after the fs specified by 'zfs send'. */ chopprefix = drrb->drr_toname + strlen(sendfs); } else { /* A snapshot was specified as an exact path (no -d or -e). */ if (recursive) { zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "cannot specify snapshot name for multi-snapshot " "stream")); err = zfs_error(hdl, EZFS_BADSTREAM, errbuf); goto out; } chopprefix = drrb->drr_toname + strlen(drrb->drr_toname); } ASSERT(strstr(drrb->drr_toname, sendfs) == drrb->drr_toname); ASSERT(chopprefix > drrb->drr_toname || strchr(sendfs, '/') == NULL); ASSERT(chopprefix <= drrb->drr_toname + strlen(drrb->drr_toname) || strchr(sendfs, '/') == NULL); ASSERT(chopprefix[0] == '/' || chopprefix[0] == '@' || chopprefix[0] == '\0'); /* * Determine name of destination snapshot. */ (void) strlcpy(destsnap, tosnap, sizeof (destsnap)); (void) strlcat(destsnap, chopprefix, sizeof (destsnap)); if (cp != NULL) umem_free(cp, strlen(cp) + 1); if (!zfs_name_valid(destsnap, ZFS_TYPE_SNAPSHOT)) { err = zfs_error(hdl, EZFS_INVALIDNAME, errbuf); goto out; } /* * Determine the name of the origin snapshot. */ if (originsnap) { (void) strlcpy(origin, originsnap, sizeof (origin)); if (flags->verbose) (void) printf("using provided clone origin %s\n", origin); } else if (drrb->drr_flags & DRR_FLAG_CLONE) { if (guid_to_name(hdl, destsnap, drrb->drr_fromguid, B_FALSE, origin) != 0) { zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "local origin for clone %s does not exist"), destsnap); err = zfs_error(hdl, EZFS_NOENT, errbuf); goto out; } if (flags->verbose) (void) printf("found clone origin %s\n", origin); } if ((DMU_GET_FEATUREFLAGS(drrb->drr_versioninfo) & DMU_BACKUP_FEATURE_DEDUP)) { (void) fprintf(stderr, gettext("ERROR: \"zfs receive\" no longer supports " "deduplicated send streams. Use\n" "the \"zstream redup\" command to convert this stream " "to a regular,\n" "non-deduplicated stream.\n")); err = zfs_error(hdl, EZFS_NOTSUP, errbuf); goto out; } boolean_t resuming = DMU_GET_FEATUREFLAGS(drrb->drr_versioninfo) & DMU_BACKUP_FEATURE_RESUMING; boolean_t raw = DMU_GET_FEATUREFLAGS(drrb->drr_versioninfo) & DMU_BACKUP_FEATURE_RAW; boolean_t embedded = DMU_GET_FEATUREFLAGS(drrb->drr_versioninfo) & DMU_BACKUP_FEATURE_EMBED_DATA; stream_wantsnewfs = (drrb->drr_fromguid == 0 || (drrb->drr_flags & DRR_FLAG_CLONE) || originsnap) && !resuming; stream_resumingnewfs = (drrb->drr_fromguid == 0 || (drrb->drr_flags & DRR_FLAG_CLONE) || originsnap) && resuming; if (stream_wantsnewfs) { /* * if the parent fs does not exist, look for it based on * the parent snap GUID */ (void) snprintf(errbuf, sizeof (errbuf), dgettext(TEXT_DOMAIN, "cannot receive new filesystem stream")); (void) strlcpy(name, destsnap, sizeof (name)); cp = strrchr(name, '/'); if (cp) *cp = '\0'; if (cp && !zfs_dataset_exists(hdl, name, ZFS_TYPE_DATASET)) { char suffix[ZFS_MAX_DATASET_NAME_LEN]; (void) strlcpy(suffix, strrchr(destsnap, '/'), sizeof (suffix)); if (guid_to_name(hdl, name, parent_snapguid, B_FALSE, destsnap) == 0) { *strchr(destsnap, '@') = '\0'; (void) strlcat(destsnap, suffix, sizeof (destsnap) - strlen(destsnap)); } } } else { /* * If the fs does not exist, look for it based on the * fromsnap GUID. */ if (resuming) { (void) snprintf(errbuf, sizeof (errbuf), dgettext(TEXT_DOMAIN, "cannot receive resume stream")); } else { (void) snprintf(errbuf, sizeof (errbuf), dgettext(TEXT_DOMAIN, "cannot receive incremental stream")); } (void) strlcpy(name, destsnap, sizeof (name)); *strchr(name, '@') = '\0'; /* * If the exact receive path was specified and this is the * topmost path in the stream, then if the fs does not exist we * should look no further. */ if ((flags->isprefix || (*(chopprefix = drrb->drr_toname + strlen(sendfs)) != '\0' && *chopprefix != '@')) && !zfs_dataset_exists(hdl, name, ZFS_TYPE_DATASET)) { char snap[ZFS_MAX_DATASET_NAME_LEN]; (void) strlcpy(snap, strchr(destsnap, '@'), sizeof (snap)); if (guid_to_name(hdl, name, drrb->drr_fromguid, B_FALSE, destsnap) == 0) { *strchr(destsnap, '@') = '\0'; (void) strlcat(destsnap, snap, sizeof (destsnap) - strlen(destsnap)); } } } (void) strlcpy(name, destsnap, sizeof (name)); *strchr(name, '@') = '\0'; redacted = DMU_GET_FEATUREFLAGS(drrb->drr_versioninfo) & DMU_BACKUP_FEATURE_REDACTED; if (flags->heal) { if (flags->isprefix || flags->istail || flags->force || flags->canmountoff || flags->resumable || flags->nomount || flags->skipholds) { zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "corrective recv can not be used when combined with" " this flag")); err = zfs_error(hdl, EZFS_INVALIDNAME, errbuf); goto out; } uint64_t guid = get_snap_guid(hdl, name, strchr(destsnap, '@') + 1); if (guid == 0) { zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "corrective recv must specify an existing snapshot" " to heal")); err = zfs_error(hdl, EZFS_INVALIDNAME, errbuf); goto out; } else if (guid != drrb->drr_toguid) { zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "local snapshot doesn't match the snapshot" " in the provided stream")); err = zfs_error(hdl, EZFS_WRONG_PARENT, errbuf); goto out; } } else if (zfs_dataset_exists(hdl, name, ZFS_TYPE_DATASET)) { zfs_cmd_t zc = {"\0"}; zfs_handle_t *zhp = NULL; boolean_t encrypted; (void) strcpy(zc.zc_name, name); /* * Destination fs exists. It must be one of these cases: * - an incremental send stream * - the stream specifies a new fs (full stream or clone) * and they want us to blow away the existing fs (and * have therefore specified -F and removed any snapshots) * - we are resuming a failed receive. */ if (stream_wantsnewfs) { boolean_t is_volume = drrb->drr_type == DMU_OST_ZVOL; if (!flags->force) { zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "destination '%s' exists\n" "must specify -F to overwrite it"), name); err = zfs_error(hdl, EZFS_EXISTS, errbuf); goto out; } if (zfs_ioctl(hdl, ZFS_IOC_SNAPSHOT_LIST_NEXT, &zc) == 0) { zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "destination has snapshots (eg. %s)\n" "must destroy them to overwrite it"), zc.zc_name); err = zfs_error(hdl, EZFS_EXISTS, errbuf); goto out; } if (is_volume && strrchr(name, '/') == NULL) { zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "destination %s is the root dataset\n" "cannot overwrite with a ZVOL"), name); err = zfs_error(hdl, EZFS_EXISTS, errbuf); goto out; } if (is_volume && zfs_ioctl(hdl, ZFS_IOC_DATASET_LIST_NEXT, &zc) == 0) { zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "destination has children (eg. %s)\n" "cannot overwrite with a ZVOL"), zc.zc_name); err = zfs_error(hdl, EZFS_WRONG_PARENT, errbuf); goto out; } } if ((zhp = zfs_open(hdl, name, ZFS_TYPE_FILESYSTEM | ZFS_TYPE_VOLUME)) == NULL) { err = -1; goto out; } /* * When receiving full/newfs on existing dataset, then it * should be done with "-F" flag. Its enforced for initial * receive in previous checks in this function. * Similarly, on resuming full/newfs recv on existing dataset, * it should be done with "-F" flag. * * When dataset doesn't exist, then full/newfs recv is done on * newly created dataset and it's marked INCONSISTENT. But * When receiving on existing dataset, recv is first done on * %recv and its marked INCONSISTENT. Existing dataset is not * marked INCONSISTENT. * Resume of full/newfs receive with dataset not INCONSISTENT * indicates that its resuming newfs on existing dataset. So, * enforce "-F" flag in this case. */ if (stream_resumingnewfs && !zfs_prop_get_int(zhp, ZFS_PROP_INCONSISTENT) && !flags->force) { zfs_close(zhp); zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "Resuming recv on existing destination '%s'\n" "must specify -F to overwrite it"), name); err = zfs_error(hdl, EZFS_RESUME_EXISTS, errbuf); goto out; } if (stream_wantsnewfs && zhp->zfs_dmustats.dds_origin[0]) { zfs_close(zhp); zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "destination '%s' is a clone\n" "must destroy it to overwrite it"), name); err = zfs_error(hdl, EZFS_EXISTS, errbuf); goto out; } /* * Raw sends can not be performed as an incremental on top * of existing unencrypted datasets. zfs recv -F can't be * used to blow away an existing encrypted filesystem. This * is because it would require the dsl dir to point to the * new key (or lack of a key) and the old key at the same * time. The -F flag may still be used for deleting * intermediate snapshots that would otherwise prevent the * receive from working. */ encrypted = zfs_prop_get_int(zhp, ZFS_PROP_ENCRYPTION) != ZIO_CRYPT_OFF; if (!stream_wantsnewfs && !encrypted && raw) { zfs_close(zhp); zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "cannot perform raw receive on top of " "existing unencrypted dataset")); err = zfs_error(hdl, EZFS_BADRESTORE, errbuf); goto out; } if (stream_wantsnewfs && flags->force && ((raw && !encrypted) || encrypted)) { zfs_close(zhp); zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "zfs receive -F cannot be used to destroy an " "encrypted filesystem or overwrite an " "unencrypted one with an encrypted one")); err = zfs_error(hdl, EZFS_BADRESTORE, errbuf); goto out; } if (!flags->dryrun && zhp->zfs_type == ZFS_TYPE_FILESYSTEM && (stream_wantsnewfs || stream_resumingnewfs)) { /* We can't do online recv in this case */ clp = changelist_gather(zhp, ZFS_PROP_NAME, 0, flags->forceunmount ? MS_FORCE : 0); if (clp == NULL) { zfs_close(zhp); err = -1; goto out; } if (changelist_prefix(clp) != 0) { changelist_free(clp); zfs_close(zhp); err = -1; goto out; } } /* * If we are resuming a newfs, set newfs here so that we will * mount it if the recv succeeds this time. We can tell * that it was a newfs on the first recv because the fs * itself will be inconsistent (if the fs existed when we * did the first recv, we would have received it into * .../%recv). */ if (resuming && zfs_prop_get_int(zhp, ZFS_PROP_INCONSISTENT)) newfs = B_TRUE; /* we want to know if we're zoned when validating -o|-x props */ zoned = zfs_prop_get_int(zhp, ZFS_PROP_ZONED); /* may need this info later, get it now we have zhp around */ if (zfs_prop_get(zhp, ZFS_PROP_RECEIVE_RESUME_TOKEN, NULL, 0, NULL, NULL, 0, B_TRUE) == 0) hastoken = B_TRUE; /* gather existing properties on destination */ origprops = fnvlist_alloc(); fnvlist_merge(origprops, zhp->zfs_props); fnvlist_merge(origprops, zhp->zfs_user_props); zfs_close(zhp); } else { zfs_handle_t *zhp; /* * Destination filesystem does not exist. Therefore we better * be creating a new filesystem (either from a full backup, or * a clone). It would therefore be invalid if the user * specified only the pool name (i.e. if the destination name * contained no slash character). */ cp = strrchr(name, '/'); if (!stream_wantsnewfs || cp == NULL) { zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "destination '%s' does not exist"), name); err = zfs_error(hdl, EZFS_NOENT, errbuf); goto out; } /* * Trim off the final dataset component so we perform the * recvbackup ioctl to the filesystems's parent. */ *cp = '\0'; if (flags->isprefix && !flags->istail && !flags->dryrun && create_parents(hdl, destsnap, strlen(tosnap)) != 0) { err = zfs_error(hdl, EZFS_BADRESTORE, errbuf); goto out; } /* validate parent */ zhp = zfs_open(hdl, name, ZFS_TYPE_DATASET); if (zhp == NULL) { err = zfs_error(hdl, EZFS_BADRESTORE, errbuf); goto out; } if (zfs_get_type(zhp) != ZFS_TYPE_FILESYSTEM) { zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "parent '%s' is not a filesystem"), name); err = zfs_error(hdl, EZFS_WRONG_PARENT, errbuf); zfs_close(zhp); goto out; } zfs_close(zhp); newfs = B_TRUE; *cp = '/'; } if (flags->verbose) { (void) printf("%s %s%s stream of %s into %s\n", flags->dryrun ? "would receive" : "receiving", flags->heal ? " corrective" : "", drrb->drr_fromguid ? "incremental" : "full", drrb->drr_toname, destsnap); (void) fflush(stdout); } /* * If this is the top-level dataset, record it so we can use it * for recursive operations later. */ if (top_zfs != NULL && (*top_zfs == NULL || strcmp(*top_zfs, name) == 0)) { toplevel = B_TRUE; if (*top_zfs == NULL) *top_zfs = zfs_strdup(hdl, name); } if (drrb->drr_type == DMU_OST_ZVOL) { type = ZFS_TYPE_VOLUME; } else if (drrb->drr_type == DMU_OST_ZFS) { type = ZFS_TYPE_FILESYSTEM; } else { zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "invalid record type: 0x%d"), drrb->drr_type); err = zfs_error(hdl, EZFS_BADSTREAM, errbuf); goto out; } if ((err = zfs_setup_cmdline_props(hdl, type, name, zoned, recursive, stream_wantsnewfs, raw, toplevel, rcvprops, cmdprops, origprops, &oxprops, &wkeydata, &wkeylen, errbuf)) != 0) goto out; /* * When sending with properties (zfs send -p), the encryption property * is not included because it is a SETONCE property and therefore * treated as read only. However, we are always able to determine its * value because raw sends will include it in the DRR_BDEGIN payload * and non-raw sends with properties are not allowed for encrypted * datasets. Therefore, if this is a non-raw properties stream, we can * infer that the value should be ZIO_CRYPT_OFF and manually add that * to the received properties. */ if (stream_wantsnewfs && !raw && rcvprops != NULL && !nvlist_exists(cmdprops, zfs_prop_to_name(ZFS_PROP_ENCRYPTION))) { if (oxprops == NULL) oxprops = fnvlist_alloc(); fnvlist_add_uint64(oxprops, zfs_prop_to_name(ZFS_PROP_ENCRYPTION), ZIO_CRYPT_OFF); } if (flags->dryrun) { void *buf = zfs_alloc(hdl, SPA_MAXBLOCKSIZE); /* * We have read the DRR_BEGIN record, but we have * not yet read the payload. For non-dryrun sends * this will be done by the kernel, so we must * emulate that here, before attempting to read * more records. */ err = recv_read(hdl, infd, buf, drr->drr_payloadlen, flags->byteswap, NULL); free(buf); if (err != 0) goto out; err = recv_skip(hdl, infd, flags->byteswap); goto out; } if (flags->heal) { err = ioctl_err = lzc_receive_with_heal(destsnap, rcvprops, oxprops, wkeydata, wkeylen, origin, flags->force, flags->heal, flags->resumable, raw, infd, drr_noswap, -1, &read_bytes, &errflags, NULL, &prop_errors); } else { err = ioctl_err = lzc_receive_with_cmdprops(destsnap, rcvprops, oxprops, wkeydata, wkeylen, origin, flags->force, flags->resumable, raw, infd, drr_noswap, -1, &read_bytes, &errflags, NULL, &prop_errors); } ioctl_errno = ioctl_err; prop_errflags = errflags; if (err == 0) { nvpair_t *prop_err = NULL; while ((prop_err = nvlist_next_nvpair(prop_errors, prop_err)) != NULL) { char tbuf[1024]; zfs_prop_t prop; int intval; prop = zfs_name_to_prop(nvpair_name(prop_err)); (void) nvpair_value_int32(prop_err, &intval); if (strcmp(nvpair_name(prop_err), ZPROP_N_MORE_ERRORS) == 0) { trunc_prop_errs(intval); break; } else if (snapname == NULL || finalsnap == NULL || strcmp(finalsnap, snapname) == 0 || strcmp(nvpair_name(prop_err), zfs_prop_to_name(ZFS_PROP_REFQUOTA)) != 0) { /* * Skip the special case of, for example, * "refquota", errors on intermediate * snapshots leading up to a final one. * That's why we have all of the checks above. * * See zfs_ioctl.c's extract_delay_props() for * a list of props which can fail on * intermediate snapshots, but shouldn't * affect the overall receive. */ (void) snprintf(tbuf, sizeof (tbuf), dgettext(TEXT_DOMAIN, "cannot receive %s property on %s"), nvpair_name(prop_err), name); zfs_setprop_error(hdl, prop, intval, tbuf); } } } if (err == 0 && snapprops_nvlist) { zfs_cmd_t zc = {"\0"}; (void) strlcpy(zc.zc_name, destsnap, sizeof (zc.zc_name)); zc.zc_cookie = B_TRUE; /* received */ zcmd_write_src_nvlist(hdl, &zc, snapprops_nvlist); (void) zfs_ioctl(hdl, ZFS_IOC_SET_PROP, &zc); zcmd_free_nvlists(&zc); } if (err == 0 && snapholds_nvlist) { nvpair_t *pair; nvlist_t *holds, *errors = NULL; int cleanup_fd = -1; VERIFY(0 == nvlist_alloc(&holds, 0, KM_SLEEP)); for (pair = nvlist_next_nvpair(snapholds_nvlist, NULL); pair != NULL; pair = nvlist_next_nvpair(snapholds_nvlist, pair)) { fnvlist_add_string(holds, destsnap, nvpair_name(pair)); } (void) lzc_hold(holds, cleanup_fd, &errors); fnvlist_free(snapholds_nvlist); fnvlist_free(holds); } if (err && (ioctl_errno == ENOENT || ioctl_errno == EEXIST)) { /* * It may be that this snapshot already exists, * in which case we want to consume & ignore it * rather than failing. */ avl_tree_t *local_avl; nvlist_t *local_nv, *fs; cp = strchr(destsnap, '@'); /* * XXX Do this faster by just iterating over snaps in * this fs. Also if zc_value does not exist, we will * get a strange "does not exist" error message. */ *cp = '\0'; if (gather_nvlist(hdl, destsnap, NULL, NULL, B_FALSE, B_TRUE, B_FALSE, B_FALSE, B_FALSE, B_FALSE, B_FALSE, B_FALSE, B_TRUE, &local_nv, &local_avl) == 0) { *cp = '@'; fs = fsavl_find(local_avl, drrb->drr_toguid, NULL); fsavl_destroy(local_avl); fnvlist_free(local_nv); if (fs != NULL) { if (flags->verbose) { (void) printf("snap %s already exists; " "ignoring\n", destsnap); } err = ioctl_err = recv_skip(hdl, infd, flags->byteswap); } } *cp = '@'; } if (ioctl_err != 0) { switch (ioctl_errno) { case ENODEV: cp = strchr(destsnap, '@'); *cp = '\0'; zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "most recent snapshot of %s does not\n" "match incremental source"), destsnap); (void) zfs_error(hdl, EZFS_BADRESTORE, errbuf); *cp = '@'; break; case ETXTBSY: zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "destination %s has been modified\n" "since most recent snapshot"), name); (void) zfs_error(hdl, EZFS_BADRESTORE, errbuf); break; case EACCES: if (flags->heal) { zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "key must be loaded to do a non-raw " "corrective recv on an encrypted " "dataset.")); } else if (raw && stream_wantsnewfs) { zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "failed to create encryption key")); } else if (raw && !stream_wantsnewfs) { zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "encryption key does not match " "existing key")); } else { zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "inherited key must be loaded")); } (void) zfs_error(hdl, EZFS_CRYPTOFAILED, errbuf); break; case EEXIST: cp = strchr(destsnap, '@'); if (newfs) { /* it's the containing fs that exists */ *cp = '\0'; } zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "destination already exists")); (void) zfs_error_fmt(hdl, EZFS_EXISTS, dgettext(TEXT_DOMAIN, "cannot restore to %s"), destsnap); *cp = '@'; break; case EINVAL: if (flags->resumable) { zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "kernel modules must be upgraded to " "receive this stream.")); } else if (embedded && !raw) { zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "incompatible embedded data stream " "feature with encrypted receive.")); } (void) zfs_error(hdl, EZFS_BADSTREAM, errbuf); break; case ECKSUM: case ZFS_ERR_STREAM_TRUNCATED: if (flags->heal) zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "corrective receive was not able to " "reconstruct the data needed for " "healing.")); else recv_ecksum_set_aux(hdl, destsnap, flags->resumable, ioctl_err == ECKSUM); (void) zfs_error(hdl, EZFS_BADSTREAM, errbuf); break; case ZFS_ERR_STREAM_LARGE_BLOCK_MISMATCH: zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "incremental send stream requires -L " "(--large-block), to match previous receive.")); (void) zfs_error(hdl, EZFS_BADSTREAM, errbuf); break; case ENOTSUP: if (flags->heal) zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "stream is not compatible with the " "data in the pool.")); else zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "pool must be upgraded to receive this " "stream.")); (void) zfs_error(hdl, EZFS_BADVERSION, errbuf); break; case EDQUOT: zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "destination %s space quota exceeded."), name); (void) zfs_error(hdl, EZFS_NOSPC, errbuf); break; case ZFS_ERR_FROM_IVSET_GUID_MISSING: zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "IV set guid missing. See errata %u at " "https://openzfs.github.io/openzfs-docs/msg/" "ZFS-8000-ER."), ZPOOL_ERRATA_ZOL_8308_ENCRYPTION); (void) zfs_error(hdl, EZFS_BADSTREAM, errbuf); break; case ZFS_ERR_FROM_IVSET_GUID_MISMATCH: zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "IV set guid mismatch. See the 'zfs receive' " "man page section\n discussing the limitations " "of raw encrypted send streams.")); (void) zfs_error(hdl, EZFS_BADSTREAM, errbuf); break; case ZFS_ERR_SPILL_BLOCK_FLAG_MISSING: zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "Spill block flag missing for raw send.\n" "The zfs software on the sending system must " "be updated.")); (void) zfs_error(hdl, EZFS_BADSTREAM, errbuf); break; case ZFS_ERR_RESUME_EXISTS: cp = strchr(destsnap, '@'); if (newfs) { /* it's the containing fs that exists */ *cp = '\0'; } zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "Resuming recv on existing dataset without force")); (void) zfs_error_fmt(hdl, EZFS_RESUME_EXISTS, dgettext(TEXT_DOMAIN, "cannot resume recv %s"), destsnap); *cp = '@'; break; case EBUSY: if (hastoken) { zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "destination %s contains " "partially-complete state from " "\"zfs receive -s\"."), name); (void) zfs_error(hdl, EZFS_BUSY, errbuf); break; } zfs_fallthrough; default: (void) zfs_standard_error(hdl, ioctl_errno, errbuf); } } /* * Mount the target filesystem (if created). Also mount any * children of the target filesystem if we did a replication * receive (indicated by stream_avl being non-NULL). */ if (clp) { if (!flags->nomount) err |= changelist_postfix(clp); changelist_free(clp); } if ((newfs || stream_avl) && type == ZFS_TYPE_FILESYSTEM && !redacted) flags->domount = B_TRUE; if (prop_errflags & ZPROP_ERR_NOCLEAR) { (void) fprintf(stderr, dgettext(TEXT_DOMAIN, "Warning: " "failed to clear unreceived properties on %s"), name); (void) fprintf(stderr, "\n"); } if (prop_errflags & ZPROP_ERR_NORESTORE) { (void) fprintf(stderr, dgettext(TEXT_DOMAIN, "Warning: " "failed to restore original properties on %s"), name); (void) fprintf(stderr, "\n"); } if (err || ioctl_err) { err = -1; goto out; } if (flags->verbose) { char buf1[64]; char buf2[64]; uint64_t bytes = read_bytes; struct timespec delta; clock_gettime(CLOCK_MONOTONIC_RAW, &delta); if (begin_time.tv_nsec > delta.tv_nsec) { delta.tv_nsec = 1000000000 + delta.tv_nsec - begin_time.tv_nsec; delta.tv_sec -= 1; } else delta.tv_nsec -= begin_time.tv_nsec; delta.tv_sec -= begin_time.tv_sec; if (delta.tv_sec == 0 && delta.tv_nsec == 0) delta.tv_nsec = 1; double delta_f = delta.tv_sec + (delta.tv_nsec / 1e9); zfs_nicebytes(bytes, buf1, sizeof (buf1)); zfs_nicebytes(bytes / delta_f, buf2, sizeof (buf2)); (void) printf("received %s stream in %.2f seconds (%s/sec)\n", buf1, delta_f, buf2); } err = 0; out: if (prop_errors != NULL) fnvlist_free(prop_errors); if (tmp_keylocation[0] != '\0') { fnvlist_add_string(rcvprops, zfs_prop_to_name(ZFS_PROP_KEYLOCATION), tmp_keylocation); } if (newprops) fnvlist_free(rcvprops); fnvlist_free(oxprops); fnvlist_free(origprops); return (err); } /* * Check properties we were asked to override (both -o|-x) */ static boolean_t zfs_receive_checkprops(libzfs_handle_t *hdl, nvlist_t *props, const char *errbuf) { nvpair_t *nvp = NULL; zfs_prop_t prop; const char *name; while ((nvp = nvlist_next_nvpair(props, nvp)) != NULL) { name = nvpair_name(nvp); prop = zfs_name_to_prop(name); if (prop == ZPROP_USERPROP) { if (!zfs_prop_user(name)) { zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "%s: invalid property '%s'"), errbuf, name); return (B_FALSE); } continue; } /* * "origin" is readonly but is used to receive datasets as * clones so we don't raise an error here */ if (prop == ZFS_PROP_ORIGIN) continue; /* encryption params have their own verification later */ if (prop == ZFS_PROP_ENCRYPTION || zfs_prop_encryption_key_param(prop)) continue; /* * cannot override readonly, set-once and other specific * settable properties */ if (zfs_prop_readonly(prop) || prop == ZFS_PROP_VERSION || prop == ZFS_PROP_VOLSIZE) { zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "%s: invalid property '%s'"), errbuf, name); return (B_FALSE); } } return (B_TRUE); } static int zfs_receive_impl(libzfs_handle_t *hdl, const char *tosnap, const char *originsnap, recvflags_t *flags, int infd, const char *sendfs, nvlist_t *stream_nv, avl_tree_t *stream_avl, char **top_zfs, const char *finalsnap, nvlist_t *cmdprops) { int err; dmu_replay_record_t drr, drr_noswap; struct drr_begin *drrb = &drr.drr_u.drr_begin; char errbuf[ERRBUFLEN]; zio_cksum_t zcksum = { { 0 } }; uint64_t featureflags; int hdrtype; (void) snprintf(errbuf, sizeof (errbuf), dgettext(TEXT_DOMAIN, "cannot receive")); /* check cmdline props, raise an error if they cannot be received */ if (!zfs_receive_checkprops(hdl, cmdprops, errbuf)) return (zfs_error(hdl, EZFS_BADPROP, errbuf)); if (flags->isprefix && !zfs_dataset_exists(hdl, tosnap, ZFS_TYPE_DATASET)) { zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "specified fs " "(%s) does not exist"), tosnap); return (zfs_error(hdl, EZFS_NOENT, errbuf)); } if (originsnap && !zfs_dataset_exists(hdl, originsnap, ZFS_TYPE_DATASET)) { zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "specified origin fs " "(%s) does not exist"), originsnap); return (zfs_error(hdl, EZFS_NOENT, errbuf)); } /* read in the BEGIN record */ if (0 != (err = recv_read(hdl, infd, &drr, sizeof (drr), B_FALSE, &zcksum))) return (err); if (drr.drr_type == DRR_END || drr.drr_type == BSWAP_32(DRR_END)) { /* It's the double end record at the end of a package */ return (ENODATA); } /* the kernel needs the non-byteswapped begin record */ drr_noswap = drr; flags->byteswap = B_FALSE; if (drrb->drr_magic == BSWAP_64(DMU_BACKUP_MAGIC)) { /* * We computed the checksum in the wrong byteorder in * recv_read() above; do it again correctly. */ memset(&zcksum, 0, sizeof (zio_cksum_t)); fletcher_4_incremental_byteswap(&drr, sizeof (drr), &zcksum); flags->byteswap = B_TRUE; drr.drr_type = BSWAP_32(drr.drr_type); drr.drr_payloadlen = BSWAP_32(drr.drr_payloadlen); drrb->drr_magic = BSWAP_64(drrb->drr_magic); drrb->drr_versioninfo = BSWAP_64(drrb->drr_versioninfo); drrb->drr_creation_time = BSWAP_64(drrb->drr_creation_time); drrb->drr_type = BSWAP_32(drrb->drr_type); drrb->drr_flags = BSWAP_32(drrb->drr_flags); drrb->drr_toguid = BSWAP_64(drrb->drr_toguid); drrb->drr_fromguid = BSWAP_64(drrb->drr_fromguid); } if (drrb->drr_magic != DMU_BACKUP_MAGIC || drr.drr_type != DRR_BEGIN) { zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "invalid " "stream (bad magic number)")); return (zfs_error(hdl, EZFS_BADSTREAM, errbuf)); } featureflags = DMU_GET_FEATUREFLAGS(drrb->drr_versioninfo); hdrtype = DMU_GET_STREAM_HDRTYPE(drrb->drr_versioninfo); if (!DMU_STREAM_SUPPORTED(featureflags) || (hdrtype != DMU_SUBSTREAM && hdrtype != DMU_COMPOUNDSTREAM)) { /* * Let's be explicit about this one, since rather than * being a new feature we can't know, it's an old * feature we dropped. */ if (featureflags & DMU_BACKUP_FEATURE_DEDUP) { zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "stream has deprecated feature: dedup, try " "'zstream redup [send in a file] | zfs recv " "[...]'")); } else { zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "stream has unsupported feature, feature flags = " "%llx (unknown flags = %llx)"), (u_longlong_t)featureflags, (u_longlong_t)((featureflags) & ~DMU_BACKUP_FEATURE_MASK)); } return (zfs_error(hdl, EZFS_BADSTREAM, errbuf)); } /* Holds feature is set once in the compound stream header. */ if (featureflags & DMU_BACKUP_FEATURE_HOLDS) flags->holds = B_TRUE; if (strchr(drrb->drr_toname, '@') == NULL) { zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "invalid " "stream (bad snapshot name)")); return (zfs_error(hdl, EZFS_BADSTREAM, errbuf)); } if (DMU_GET_STREAM_HDRTYPE(drrb->drr_versioninfo) == DMU_SUBSTREAM) { char nonpackage_sendfs[ZFS_MAX_DATASET_NAME_LEN]; if (sendfs == NULL) { /* * We were not called from zfs_receive_package(). Get * the fs specified by 'zfs send'. */ char *cp; (void) strlcpy(nonpackage_sendfs, drr.drr_u.drr_begin.drr_toname, sizeof (nonpackage_sendfs)); if ((cp = strchr(nonpackage_sendfs, '@')) != NULL) *cp = '\0'; sendfs = nonpackage_sendfs; VERIFY(finalsnap == NULL); } return (zfs_receive_one(hdl, infd, tosnap, originsnap, flags, &drr, &drr_noswap, sendfs, stream_nv, stream_avl, top_zfs, finalsnap, cmdprops)); } else { assert(DMU_GET_STREAM_HDRTYPE(drrb->drr_versioninfo) == DMU_COMPOUNDSTREAM); return (zfs_receive_package(hdl, infd, tosnap, flags, &drr, &zcksum, top_zfs, cmdprops)); } } /* * Restores a backup of tosnap from the file descriptor specified by infd. * Return 0 on total success, -2 if some things couldn't be * destroyed/renamed/promoted, -1 if some things couldn't be received. * (-1 will override -2, if -1 and the resumable flag was specified the * transfer can be resumed if the sending side supports it). */ int zfs_receive(libzfs_handle_t *hdl, const char *tosnap, nvlist_t *props, recvflags_t *flags, int infd, avl_tree_t *stream_avl) { char *top_zfs = NULL; int err; struct stat sb; char *originsnap = NULL; /* * The only way fstat can fail is if we do not have a valid file * descriptor. */ if (fstat(infd, &sb) == -1) { perror("fstat"); return (-2); } if (props) { err = nvlist_lookup_string(props, "origin", &originsnap); if (err && err != ENOENT) return (err); } err = zfs_receive_impl(hdl, tosnap, originsnap, flags, infd, NULL, NULL, stream_avl, &top_zfs, NULL, props); if (err == 0 && !flags->nomount && flags->domount && top_zfs) { zfs_handle_t *zhp = NULL; prop_changelist_t *clp = NULL; zhp = zfs_open(hdl, top_zfs, ZFS_TYPE_FILESYSTEM | ZFS_TYPE_VOLUME); if (zhp == NULL) { err = -1; goto out; } else { if (zhp->zfs_type == ZFS_TYPE_VOLUME) { zfs_close(zhp); goto out; } clp = changelist_gather(zhp, ZFS_PROP_MOUNTPOINT, CL_GATHER_MOUNT_ALWAYS, flags->forceunmount ? MS_FORCE : 0); zfs_close(zhp); if (clp == NULL) { err = -1; goto out; } /* mount and share received datasets */ err = changelist_postfix(clp); changelist_free(clp); if (err != 0) err = -1; } } out: if (top_zfs) free(top_zfs); return (err); } diff --git a/lib/libzfs/libzfs_util.c b/lib/libzfs/libzfs_util.c index 28dd4f426a96..b4679dbb36fd 100644 --- a/lib/libzfs/libzfs_util.c +++ b/lib/libzfs/libzfs_util.c @@ -1,2042 +1,2042 @@ /* * CDDL HEADER START * * The contents of this file are subject to the terms of the * Common Development and Distribution License (the "License"). * You may not use this file except in compliance with the License. * * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE * or https://opensource.org/licenses/CDDL-1.0. * See the License for the specific language governing permissions * and limitations under the License. * * When distributing Covered Code, include this CDDL HEADER in each * file and include the License file at usr/src/OPENSOLARIS.LICENSE. * If applicable, add the following below this CDDL HEADER, with the * fields enclosed by brackets "[]" replaced with your own identifying * information: Portions Copyright [yyyy] [name of copyright owner] * * CDDL HEADER END */ /* * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. * Copyright 2020 Joyent, Inc. All rights reserved. * Copyright (c) 2011, 2020 by Delphix. All rights reserved. * Copyright 2016 Igor Kozhukhov * Copyright (c) 2017 Datto Inc. * Copyright (c) 2020 The FreeBSD Foundation * * Portions of this software were developed by Allan Jude * under sponsorship from the FreeBSD Foundation. */ /* * Internal utility routines for the ZFS library. */ #include #include #include #include #include #include #include #include #include #if LIBFETCH_DYNAMIC #include #endif #include #include #include #include #include #include #include #include "libzfs_impl.h" #include "zfs_prop.h" #include "zfeature_common.h" #include #include /* * We only care about the scheme in order to match the scheme * with the handler. Each handler should validate the full URI * as necessary. */ #define URI_REGEX "^\\([A-Za-z][A-Za-z0-9+.\\-]*\\):" int libzfs_errno(libzfs_handle_t *hdl) { return (hdl->libzfs_error); } const char * libzfs_error_action(libzfs_handle_t *hdl) { return (hdl->libzfs_action); } const char * libzfs_error_description(libzfs_handle_t *hdl) { if (hdl->libzfs_desc[0] != '\0') return (hdl->libzfs_desc); switch (hdl->libzfs_error) { case EZFS_NOMEM: return (dgettext(TEXT_DOMAIN, "out of memory")); case EZFS_BADPROP: return (dgettext(TEXT_DOMAIN, "invalid property value")); case EZFS_PROPREADONLY: return (dgettext(TEXT_DOMAIN, "read-only property")); case EZFS_PROPTYPE: return (dgettext(TEXT_DOMAIN, "property doesn't apply to " "datasets of this type")); case EZFS_PROPNONINHERIT: return (dgettext(TEXT_DOMAIN, "property cannot be inherited")); case EZFS_PROPSPACE: return (dgettext(TEXT_DOMAIN, "invalid quota or reservation")); case EZFS_BADTYPE: return (dgettext(TEXT_DOMAIN, "operation not applicable to " "datasets of this type")); case EZFS_BUSY: return (dgettext(TEXT_DOMAIN, "pool or dataset is busy")); case EZFS_EXISTS: return (dgettext(TEXT_DOMAIN, "pool or dataset exists")); case EZFS_NOENT: return (dgettext(TEXT_DOMAIN, "no such pool or dataset")); case EZFS_BADSTREAM: return (dgettext(TEXT_DOMAIN, "invalid backup stream")); case EZFS_DSREADONLY: return (dgettext(TEXT_DOMAIN, "dataset is read-only")); case EZFS_VOLTOOBIG: return (dgettext(TEXT_DOMAIN, "volume size exceeds limit for " "this system")); case EZFS_INVALIDNAME: return (dgettext(TEXT_DOMAIN, "invalid name")); case EZFS_BADRESTORE: return (dgettext(TEXT_DOMAIN, "unable to restore to " "destination")); case EZFS_BADBACKUP: return (dgettext(TEXT_DOMAIN, "backup failed")); case EZFS_BADTARGET: return (dgettext(TEXT_DOMAIN, "invalid target vdev")); case EZFS_NODEVICE: return (dgettext(TEXT_DOMAIN, "no such device in pool")); case EZFS_BADDEV: return (dgettext(TEXT_DOMAIN, "invalid device")); case EZFS_NOREPLICAS: return (dgettext(TEXT_DOMAIN, "no valid replicas")); case EZFS_RESILVERING: return (dgettext(TEXT_DOMAIN, "currently resilvering")); case EZFS_BADVERSION: return (dgettext(TEXT_DOMAIN, "unsupported version or " "feature")); case EZFS_POOLUNAVAIL: return (dgettext(TEXT_DOMAIN, "pool is unavailable")); case EZFS_DEVOVERFLOW: return (dgettext(TEXT_DOMAIN, "too many devices in one vdev")); case EZFS_BADPATH: return (dgettext(TEXT_DOMAIN, "must be an absolute path")); case EZFS_CROSSTARGET: return (dgettext(TEXT_DOMAIN, "operation crosses datasets or " "pools")); case EZFS_ZONED: return (dgettext(TEXT_DOMAIN, "dataset in use by local zone")); case EZFS_MOUNTFAILED: return (dgettext(TEXT_DOMAIN, "mount failed")); case EZFS_UMOUNTFAILED: return (dgettext(TEXT_DOMAIN, "unmount failed")); case EZFS_UNSHARENFSFAILED: return (dgettext(TEXT_DOMAIN, "NFS share removal failed")); case EZFS_SHARENFSFAILED: return (dgettext(TEXT_DOMAIN, "NFS share creation failed")); case EZFS_UNSHARESMBFAILED: return (dgettext(TEXT_DOMAIN, "SMB share removal failed")); case EZFS_SHARESMBFAILED: return (dgettext(TEXT_DOMAIN, "SMB share creation failed")); case EZFS_PERM: return (dgettext(TEXT_DOMAIN, "permission denied")); case EZFS_NOSPC: return (dgettext(TEXT_DOMAIN, "out of space")); case EZFS_FAULT: return (dgettext(TEXT_DOMAIN, "bad address")); case EZFS_IO: return (dgettext(TEXT_DOMAIN, "I/O error")); case EZFS_INTR: return (dgettext(TEXT_DOMAIN, "signal received")); case EZFS_CKSUM: return (dgettext(TEXT_DOMAIN, "insufficient replicas")); case EZFS_ISSPARE: return (dgettext(TEXT_DOMAIN, "device is reserved as a hot " "spare")); case EZFS_INVALCONFIG: return (dgettext(TEXT_DOMAIN, "invalid vdev configuration")); case EZFS_RECURSIVE: return (dgettext(TEXT_DOMAIN, "recursive dataset dependency")); case EZFS_NOHISTORY: return (dgettext(TEXT_DOMAIN, "no history available")); case EZFS_POOLPROPS: return (dgettext(TEXT_DOMAIN, "failed to retrieve " "pool properties")); case EZFS_POOL_NOTSUP: return (dgettext(TEXT_DOMAIN, "operation not supported " "on this type of pool")); case EZFS_POOL_INVALARG: return (dgettext(TEXT_DOMAIN, "invalid argument for " "this pool operation")); case EZFS_NAMETOOLONG: return (dgettext(TEXT_DOMAIN, "dataset name is too long")); case EZFS_OPENFAILED: return (dgettext(TEXT_DOMAIN, "open failed")); case EZFS_NOCAP: return (dgettext(TEXT_DOMAIN, "disk capacity information could not be retrieved")); case EZFS_LABELFAILED: return (dgettext(TEXT_DOMAIN, "write of label failed")); case EZFS_BADWHO: return (dgettext(TEXT_DOMAIN, "invalid user/group")); case EZFS_BADPERM: return (dgettext(TEXT_DOMAIN, "invalid permission")); case EZFS_BADPERMSET: return (dgettext(TEXT_DOMAIN, "invalid permission set name")); case EZFS_NODELEGATION: return (dgettext(TEXT_DOMAIN, "delegated administration is " "disabled on pool")); case EZFS_BADCACHE: return (dgettext(TEXT_DOMAIN, "invalid or missing cache file")); case EZFS_ISL2CACHE: return (dgettext(TEXT_DOMAIN, "device is in use as a cache")); case EZFS_VDEVNOTSUP: return (dgettext(TEXT_DOMAIN, "vdev specification is not " "supported")); case EZFS_NOTSUP: return (dgettext(TEXT_DOMAIN, "operation not supported " "on this dataset")); case EZFS_IOC_NOTSUPPORTED: return (dgettext(TEXT_DOMAIN, "operation not supported by " "zfs kernel module")); case EZFS_ACTIVE_SPARE: return (dgettext(TEXT_DOMAIN, "pool has active shared spare " "device")); case EZFS_UNPLAYED_LOGS: return (dgettext(TEXT_DOMAIN, "log device has unplayed intent " "logs")); case EZFS_REFTAG_RELE: return (dgettext(TEXT_DOMAIN, "no such tag on this dataset")); case EZFS_REFTAG_HOLD: return (dgettext(TEXT_DOMAIN, "tag already exists on this " "dataset")); case EZFS_TAGTOOLONG: return (dgettext(TEXT_DOMAIN, "tag too long")); case EZFS_PIPEFAILED: return (dgettext(TEXT_DOMAIN, "pipe create failed")); case EZFS_THREADCREATEFAILED: return (dgettext(TEXT_DOMAIN, "thread create failed")); case EZFS_POSTSPLIT_ONLINE: return (dgettext(TEXT_DOMAIN, "disk was split from this pool " "into a new one")); case EZFS_SCRUB_PAUSED: return (dgettext(TEXT_DOMAIN, "scrub is paused; " "use 'zpool scrub' to resume")); case EZFS_SCRUBBING: return (dgettext(TEXT_DOMAIN, "currently scrubbing; " "use 'zpool scrub -s' to cancel current scrub")); case EZFS_NO_SCRUB: return (dgettext(TEXT_DOMAIN, "there is no active scrub")); case EZFS_DIFF: return (dgettext(TEXT_DOMAIN, "unable to generate diffs")); case EZFS_DIFFDATA: return (dgettext(TEXT_DOMAIN, "invalid diff data")); case EZFS_POOLREADONLY: return (dgettext(TEXT_DOMAIN, "pool is read-only")); case EZFS_NO_PENDING: return (dgettext(TEXT_DOMAIN, "operation is not " "in progress")); case EZFS_CHECKPOINT_EXISTS: return (dgettext(TEXT_DOMAIN, "checkpoint exists")); case EZFS_DISCARDING_CHECKPOINT: return (dgettext(TEXT_DOMAIN, "currently discarding " "checkpoint")); case EZFS_NO_CHECKPOINT: return (dgettext(TEXT_DOMAIN, "checkpoint does not exist")); case EZFS_DEVRM_IN_PROGRESS: return (dgettext(TEXT_DOMAIN, "device removal in progress")); case EZFS_VDEV_TOO_BIG: return (dgettext(TEXT_DOMAIN, "device exceeds supported size")); case EZFS_ACTIVE_POOL: return (dgettext(TEXT_DOMAIN, "pool is imported on a " "different host")); case EZFS_CRYPTOFAILED: return (dgettext(TEXT_DOMAIN, "encryption failure")); case EZFS_TOOMANY: return (dgettext(TEXT_DOMAIN, "argument list too long")); case EZFS_INITIALIZING: return (dgettext(TEXT_DOMAIN, "currently initializing")); case EZFS_NO_INITIALIZE: return (dgettext(TEXT_DOMAIN, "there is no active " "initialization")); case EZFS_WRONG_PARENT: return (dgettext(TEXT_DOMAIN, "invalid parent dataset")); case EZFS_TRIMMING: return (dgettext(TEXT_DOMAIN, "currently trimming")); case EZFS_NO_TRIM: return (dgettext(TEXT_DOMAIN, "there is no active trim")); case EZFS_TRIM_NOTSUP: return (dgettext(TEXT_DOMAIN, "trim operations are not " "supported by this device")); case EZFS_NO_RESILVER_DEFER: return (dgettext(TEXT_DOMAIN, "this action requires the " "resilver_defer feature")); case EZFS_EXPORT_IN_PROGRESS: return (dgettext(TEXT_DOMAIN, "pool export in progress")); case EZFS_REBUILDING: return (dgettext(TEXT_DOMAIN, "currently sequentially " "resilvering")); case EZFS_VDEV_NOTSUP: return (dgettext(TEXT_DOMAIN, "operation not supported " "on this type of vdev")); case EZFS_NOT_USER_NAMESPACE: return (dgettext(TEXT_DOMAIN, "the provided file " "was not a user namespace file")); case EZFS_RESUME_EXISTS: return (dgettext(TEXT_DOMAIN, "Resuming recv on existing " "dataset without force")); case EZFS_UNKNOWN: return (dgettext(TEXT_DOMAIN, "unknown error")); default: assert(hdl->libzfs_error == 0); return (dgettext(TEXT_DOMAIN, "no error")); } } void zfs_error_aux(libzfs_handle_t *hdl, const char *fmt, ...) { va_list ap; va_start(ap, fmt); (void) vsnprintf(hdl->libzfs_desc, sizeof (hdl->libzfs_desc), fmt, ap); hdl->libzfs_desc_active = 1; va_end(ap); } static void zfs_verror(libzfs_handle_t *hdl, int error, const char *fmt, va_list ap) { (void) vsnprintf(hdl->libzfs_action, sizeof (hdl->libzfs_action), fmt, ap); hdl->libzfs_error = error; if (hdl->libzfs_desc_active) hdl->libzfs_desc_active = 0; else hdl->libzfs_desc[0] = '\0'; if (hdl->libzfs_printerr) { if (error == EZFS_UNKNOWN) { (void) fprintf(stderr, dgettext(TEXT_DOMAIN, "internal " "error: %s: %s\n"), hdl->libzfs_action, libzfs_error_description(hdl)); abort(); } (void) fprintf(stderr, "%s: %s\n", hdl->libzfs_action, libzfs_error_description(hdl)); if (error == EZFS_NOMEM) exit(1); } } int zfs_error(libzfs_handle_t *hdl, int error, const char *msg) { return (zfs_error_fmt(hdl, error, "%s", msg)); } int zfs_error_fmt(libzfs_handle_t *hdl, int error, const char *fmt, ...) { va_list ap; va_start(ap, fmt); zfs_verror(hdl, error, fmt, ap); va_end(ap); return (-1); } static int zfs_common_error(libzfs_handle_t *hdl, int error, const char *fmt, va_list ap) { switch (error) { case EPERM: case EACCES: zfs_verror(hdl, EZFS_PERM, fmt, ap); return (-1); case ECANCELED: zfs_verror(hdl, EZFS_NODELEGATION, fmt, ap); return (-1); case EIO: zfs_verror(hdl, EZFS_IO, fmt, ap); return (-1); case EFAULT: zfs_verror(hdl, EZFS_FAULT, fmt, ap); return (-1); case EINTR: zfs_verror(hdl, EZFS_INTR, fmt, ap); return (-1); case ECKSUM: zfs_verror(hdl, EZFS_CKSUM, fmt, ap); return (-1); } return (0); } int zfs_standard_error(libzfs_handle_t *hdl, int error, const char *msg) { return (zfs_standard_error_fmt(hdl, error, "%s", msg)); } int zfs_standard_error_fmt(libzfs_handle_t *hdl, int error, const char *fmt, ...) { va_list ap; va_start(ap, fmt); if (zfs_common_error(hdl, error, fmt, ap) != 0) { va_end(ap); return (-1); } switch (error) { case ENXIO: case ENODEV: case EPIPE: zfs_verror(hdl, EZFS_IO, fmt, ap); break; case ENOENT: zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "dataset does not exist")); zfs_verror(hdl, EZFS_NOENT, fmt, ap); break; case ENOSPC: case EDQUOT: zfs_verror(hdl, EZFS_NOSPC, fmt, ap); break; case EEXIST: zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "dataset already exists")); zfs_verror(hdl, EZFS_EXISTS, fmt, ap); break; case EBUSY: zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "dataset is busy")); zfs_verror(hdl, EZFS_BUSY, fmt, ap); break; case EROFS: zfs_verror(hdl, EZFS_POOLREADONLY, fmt, ap); break; case ENAMETOOLONG: zfs_verror(hdl, EZFS_NAMETOOLONG, fmt, ap); break; case ENOTSUP: zfs_verror(hdl, EZFS_BADVERSION, fmt, ap); break; case EAGAIN: zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "pool I/O is currently suspended")); zfs_verror(hdl, EZFS_POOLUNAVAIL, fmt, ap); break; case EREMOTEIO: zfs_verror(hdl, EZFS_ACTIVE_POOL, fmt, ap); break; case ZFS_ERR_UNKNOWN_SEND_STREAM_FEATURE: case ZFS_ERR_IOC_CMD_UNAVAIL: zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "the loaded zfs " "module does not support this operation. A reboot may " "be required to enable this operation.")); zfs_verror(hdl, EZFS_IOC_NOTSUPPORTED, fmt, ap); break; case ZFS_ERR_IOC_ARG_UNAVAIL: zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "the loaded zfs " "module does not support an option for this operation. " "A reboot may be required to enable this option.")); zfs_verror(hdl, EZFS_IOC_NOTSUPPORTED, fmt, ap); break; case ZFS_ERR_IOC_ARG_REQUIRED: case ZFS_ERR_IOC_ARG_BADTYPE: zfs_verror(hdl, EZFS_IOC_NOTSUPPORTED, fmt, ap); break; case ZFS_ERR_WRONG_PARENT: zfs_verror(hdl, EZFS_WRONG_PARENT, fmt, ap); break; case ZFS_ERR_BADPROP: zfs_verror(hdl, EZFS_BADPROP, fmt, ap); break; case ZFS_ERR_NOT_USER_NAMESPACE: zfs_verror(hdl, EZFS_NOT_USER_NAMESPACE, fmt, ap); break; default: zfs_error_aux(hdl, "%s", strerror(error)); zfs_verror(hdl, EZFS_UNKNOWN, fmt, ap); break; } va_end(ap); return (-1); } void zfs_setprop_error(libzfs_handle_t *hdl, zfs_prop_t prop, int err, char *errbuf) { switch (err) { case ENOSPC: /* * For quotas and reservations, ENOSPC indicates * something different; setting a quota or reservation * doesn't use any disk space. */ switch (prop) { case ZFS_PROP_QUOTA: case ZFS_PROP_REFQUOTA: zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "size is less than current used or " "reserved space")); (void) zfs_error(hdl, EZFS_PROPSPACE, errbuf); break; case ZFS_PROP_RESERVATION: case ZFS_PROP_REFRESERVATION: zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "size is greater than available space")); (void) zfs_error(hdl, EZFS_PROPSPACE, errbuf); break; default: (void) zfs_standard_error(hdl, err, errbuf); break; } break; case EBUSY: (void) zfs_standard_error(hdl, EBUSY, errbuf); break; case EROFS: (void) zfs_error(hdl, EZFS_DSREADONLY, errbuf); break; case E2BIG: zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "property value too long")); (void) zfs_error(hdl, EZFS_BADPROP, errbuf); break; case ENOTSUP: zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "pool and or dataset must be upgraded to set this " "property or value")); (void) zfs_error(hdl, EZFS_BADVERSION, errbuf); break; case ERANGE: if (prop == ZFS_PROP_COMPRESSION || prop == ZFS_PROP_DNODESIZE || prop == ZFS_PROP_RECORDSIZE) { (void) zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "property setting is not allowed on " "bootable datasets")); (void) zfs_error(hdl, EZFS_NOTSUP, errbuf); } else if (prop == ZFS_PROP_CHECKSUM || prop == ZFS_PROP_DEDUP) { (void) zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "property setting is not allowed on " "root pools")); (void) zfs_error(hdl, EZFS_NOTSUP, errbuf); } else { (void) zfs_standard_error(hdl, err, errbuf); } break; case EINVAL: if (prop == ZPROP_INVAL) { (void) zfs_error(hdl, EZFS_BADPROP, errbuf); } else { (void) zfs_standard_error(hdl, err, errbuf); } break; case ZFS_ERR_BADPROP: (void) zfs_error(hdl, EZFS_BADPROP, errbuf); break; case EACCES: if (prop == ZFS_PROP_KEYLOCATION) { zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "keylocation may only be set on encryption roots")); (void) zfs_error(hdl, EZFS_BADPROP, errbuf); } else { (void) zfs_standard_error(hdl, err, errbuf); } break; case EOVERFLOW: /* * This platform can't address a volume this big. */ #ifdef _ILP32 if (prop == ZFS_PROP_VOLSIZE) { (void) zfs_error(hdl, EZFS_VOLTOOBIG, errbuf); break; } zfs_fallthrough; #endif default: (void) zfs_standard_error(hdl, err, errbuf); } } int zpool_standard_error(libzfs_handle_t *hdl, int error, const char *msg) { return (zpool_standard_error_fmt(hdl, error, "%s", msg)); } int zpool_standard_error_fmt(libzfs_handle_t *hdl, int error, const char *fmt, ...) { va_list ap; va_start(ap, fmt); if (zfs_common_error(hdl, error, fmt, ap) != 0) { va_end(ap); return (-1); } switch (error) { case ENODEV: zfs_verror(hdl, EZFS_NODEVICE, fmt, ap); break; case ENOENT: zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "no such pool or dataset")); zfs_verror(hdl, EZFS_NOENT, fmt, ap); break; case EEXIST: zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "pool already exists")); zfs_verror(hdl, EZFS_EXISTS, fmt, ap); break; case EBUSY: zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "pool is busy")); zfs_verror(hdl, EZFS_BUSY, fmt, ap); break; /* There is no pending operation to cancel */ case ENOTACTIVE: zfs_verror(hdl, EZFS_NO_PENDING, fmt, ap); break; case ENXIO: zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "one or more devices is currently unavailable")); zfs_verror(hdl, EZFS_BADDEV, fmt, ap); break; case ENAMETOOLONG: zfs_verror(hdl, EZFS_DEVOVERFLOW, fmt, ap); break; case ENOTSUP: zfs_verror(hdl, EZFS_POOL_NOTSUP, fmt, ap); break; case EINVAL: zfs_verror(hdl, EZFS_POOL_INVALARG, fmt, ap); break; case ENOSPC: case EDQUOT: zfs_verror(hdl, EZFS_NOSPC, fmt, ap); break; case EAGAIN: zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "pool I/O is currently suspended")); zfs_verror(hdl, EZFS_POOLUNAVAIL, fmt, ap); break; case EROFS: zfs_verror(hdl, EZFS_POOLREADONLY, fmt, ap); break; case EDOM: zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "block size out of range or does not match")); zfs_verror(hdl, EZFS_BADPROP, fmt, ap); break; case EREMOTEIO: zfs_verror(hdl, EZFS_ACTIVE_POOL, fmt, ap); break; case ZFS_ERR_CHECKPOINT_EXISTS: zfs_verror(hdl, EZFS_CHECKPOINT_EXISTS, fmt, ap); break; case ZFS_ERR_DISCARDING_CHECKPOINT: zfs_verror(hdl, EZFS_DISCARDING_CHECKPOINT, fmt, ap); break; case ZFS_ERR_NO_CHECKPOINT: zfs_verror(hdl, EZFS_NO_CHECKPOINT, fmt, ap); break; case ZFS_ERR_DEVRM_IN_PROGRESS: zfs_verror(hdl, EZFS_DEVRM_IN_PROGRESS, fmt, ap); break; case ZFS_ERR_VDEV_TOO_BIG: zfs_verror(hdl, EZFS_VDEV_TOO_BIG, fmt, ap); break; case ZFS_ERR_EXPORT_IN_PROGRESS: zfs_verror(hdl, EZFS_EXPORT_IN_PROGRESS, fmt, ap); break; case ZFS_ERR_RESILVER_IN_PROGRESS: zfs_verror(hdl, EZFS_RESILVERING, fmt, ap); break; case ZFS_ERR_REBUILD_IN_PROGRESS: zfs_verror(hdl, EZFS_REBUILDING, fmt, ap); break; case ZFS_ERR_BADPROP: zfs_verror(hdl, EZFS_BADPROP, fmt, ap); break; case ZFS_ERR_VDEV_NOTSUP: zfs_verror(hdl, EZFS_VDEV_NOTSUP, fmt, ap); break; case ZFS_ERR_IOC_CMD_UNAVAIL: zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "the loaded zfs " "module does not support this operation. A reboot may " "be required to enable this operation.")); zfs_verror(hdl, EZFS_IOC_NOTSUPPORTED, fmt, ap); break; case ZFS_ERR_IOC_ARG_UNAVAIL: zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "the loaded zfs " "module does not support an option for this operation. " "A reboot may be required to enable this option.")); zfs_verror(hdl, EZFS_IOC_NOTSUPPORTED, fmt, ap); break; case ZFS_ERR_IOC_ARG_REQUIRED: case ZFS_ERR_IOC_ARG_BADTYPE: zfs_verror(hdl, EZFS_IOC_NOTSUPPORTED, fmt, ap); break; default: zfs_error_aux(hdl, "%s", strerror(error)); zfs_verror(hdl, EZFS_UNKNOWN, fmt, ap); } va_end(ap); return (-1); } /* * Display an out of memory error message and abort the current program. */ int no_memory(libzfs_handle_t *hdl) { return (zfs_error(hdl, EZFS_NOMEM, "internal error")); } /* * A safe form of malloc() which will die if the allocation fails. */ void * zfs_alloc(libzfs_handle_t *hdl, size_t size) { void *data; if ((data = calloc(1, size)) == NULL) (void) no_memory(hdl); return (data); } /* * A safe form of asprintf() which will die if the allocation fails. */ char * zfs_asprintf(libzfs_handle_t *hdl, const char *fmt, ...) { va_list ap; char *ret; int err; va_start(ap, fmt); err = vasprintf(&ret, fmt, ap); va_end(ap); if (err < 0) { (void) no_memory(hdl); ret = NULL; } return (ret); } /* * A safe form of realloc(), which also zeroes newly allocated space. */ void * zfs_realloc(libzfs_handle_t *hdl, void *ptr, size_t oldsize, size_t newsize) { void *ret; if ((ret = realloc(ptr, newsize)) == NULL) { (void) no_memory(hdl); return (NULL); } memset((char *)ret + oldsize, 0, newsize - oldsize); return (ret); } /* * A safe form of strdup() which will die if the allocation fails. */ char * zfs_strdup(libzfs_handle_t *hdl, const char *str) { char *ret; if ((ret = strdup(str)) == NULL) (void) no_memory(hdl); return (ret); } void libzfs_print_on_error(libzfs_handle_t *hdl, boolean_t printerr) { hdl->libzfs_printerr = printerr; } /* * Read lines from an open file descriptor and store them in an array of * strings until EOF. lines[] will be allocated and populated with all the * lines read. All newlines are replaced with NULL terminators for * convenience. lines[] must be freed after use with libzfs_free_str_array(). * * Returns the number of lines read. */ static int libzfs_read_stdout_from_fd(int fd, char **lines[]) { FILE *fp; int lines_cnt = 0; size_t len = 0; char *line = NULL; char **tmp_lines = NULL, **tmp; fp = fdopen(fd, "r"); if (fp == NULL) { close(fd); return (0); } while (getline(&line, &len, fp) != -1) { tmp = realloc(tmp_lines, sizeof (*tmp_lines) * (lines_cnt + 1)); if (tmp == NULL) { /* Return the lines we were able to process */ break; } tmp_lines = tmp; /* Remove newline if not EOF */ if (line[strlen(line) - 1] == '\n') line[strlen(line) - 1] = '\0'; tmp_lines[lines_cnt] = strdup(line); if (tmp_lines[lines_cnt] == NULL) break; ++lines_cnt; } free(line); fclose(fp); *lines = tmp_lines; return (lines_cnt); } static int libzfs_run_process_impl(const char *path, char *argv[], char *env[], int flags, char **lines[], int *lines_cnt) { pid_t pid; int error, devnull_fd; int link[2]; /* * Setup a pipe between our child and parent process if we're * reading stdout. */ if (lines != NULL && pipe2(link, O_NONBLOCK | O_CLOEXEC) == -1) return (-EPIPE); pid = fork(); if (pid == 0) { /* Child process */ devnull_fd = open("/dev/null", O_WRONLY | O_CLOEXEC); if (devnull_fd < 0) _exit(-1); if (!(flags & STDOUT_VERBOSE) && (lines == NULL)) (void) dup2(devnull_fd, STDOUT_FILENO); else if (lines != NULL) { /* Save the output to lines[] */ dup2(link[1], STDOUT_FILENO); } if (!(flags & STDERR_VERBOSE)) (void) dup2(devnull_fd, STDERR_FILENO); if (flags & NO_DEFAULT_PATH) { if (env == NULL) execv(path, argv); else execve(path, argv, env); } else { if (env == NULL) execvp(path, argv); else execvpe(path, argv, env); } _exit(-1); } else if (pid > 0) { /* Parent process */ int status; while ((error = waitpid(pid, &status, 0)) == -1 && errno == EINTR) ; if (error < 0 || !WIFEXITED(status)) return (-1); if (lines != NULL) { close(link[1]); *lines_cnt = libzfs_read_stdout_from_fd(link[0], lines); } return (WEXITSTATUS(status)); } return (-1); } int libzfs_run_process(const char *path, char *argv[], int flags) { return (libzfs_run_process_impl(path, argv, NULL, flags, NULL, NULL)); } /* * Run a command and store its stdout lines in an array of strings (lines[]). * lines[] is allocated and populated for you, and the number of lines is set in * lines_cnt. lines[] must be freed after use with libzfs_free_str_array(). * All newlines (\n) in lines[] are terminated for convenience. */ int libzfs_run_process_get_stdout(const char *path, char *argv[], char *env[], char **lines[], int *lines_cnt) { return (libzfs_run_process_impl(path, argv, env, 0, lines, lines_cnt)); } /* * Same as libzfs_run_process_get_stdout(), but run without $PATH set. This * means that *path needs to be the full path to the executable. */ int libzfs_run_process_get_stdout_nopath(const char *path, char *argv[], char *env[], char **lines[], int *lines_cnt) { return (libzfs_run_process_impl(path, argv, env, NO_DEFAULT_PATH, lines, lines_cnt)); } /* * Free an array of strings. Free both the strings contained in the array and * the array itself. */ void libzfs_free_str_array(char **strs, int count) { while (--count >= 0) free(strs[count]); free(strs); } /* * Returns 1 if environment variable is set to "YES", "yes", "ON", "on", or * a non-zero number. * * Returns 0 otherwise. */ boolean_t libzfs_envvar_is_set(const char *envvar) { char *env = getenv(envvar); return (env && (strtoul(env, NULL, 0) > 0 || (!strncasecmp(env, "YES", 3) && strnlen(env, 4) == 3) || (!strncasecmp(env, "ON", 2) && strnlen(env, 3) == 2))); } libzfs_handle_t * libzfs_init(void) { libzfs_handle_t *hdl; int error; char *env; if ((error = libzfs_load_module()) != 0) { errno = error; return (NULL); } if ((hdl = calloc(1, sizeof (libzfs_handle_t))) == NULL) { return (NULL); } if (regcomp(&hdl->libzfs_urire, URI_REGEX, 0) != 0) { free(hdl); return (NULL); } if ((hdl->libzfs_fd = open(ZFS_DEV, O_RDWR|O_EXCL|O_CLOEXEC)) < 0) { free(hdl); return (NULL); } if (libzfs_core_init() != 0) { (void) close(hdl->libzfs_fd); free(hdl); return (NULL); } zfs_prop_init(); zpool_prop_init(); zpool_feature_init(); vdev_prop_init(); libzfs_mnttab_init(hdl); fletcher_4_init(); if (getenv("ZFS_PROP_DEBUG") != NULL) { hdl->libzfs_prop_debug = B_TRUE; } if ((env = getenv("ZFS_SENDRECV_MAX_NVLIST")) != NULL) { if ((error = zfs_nicestrtonum(hdl, env, &hdl->libzfs_max_nvlist))) { errno = error; (void) close(hdl->libzfs_fd); free(hdl); return (NULL); } } else { hdl->libzfs_max_nvlist = (SPA_MAXBLOCKSIZE * 4); } /* * For testing, remove some settable properties and features */ if (libzfs_envvar_is_set("ZFS_SYSFS_PROP_SUPPORT_TEST")) { zprop_desc_t *proptbl; proptbl = zpool_prop_get_table(); proptbl[ZPOOL_PROP_COMMENT].pd_zfs_mod_supported = B_FALSE; proptbl = zfs_prop_get_table(); proptbl[ZFS_PROP_DNODESIZE].pd_zfs_mod_supported = B_FALSE; zfeature_info_t *ftbl = spa_feature_table; ftbl[SPA_FEATURE_LARGE_BLOCKS].fi_zfs_mod_supported = B_FALSE; } return (hdl); } void libzfs_fini(libzfs_handle_t *hdl) { (void) close(hdl->libzfs_fd); zpool_free_handles(hdl); namespace_clear(hdl); libzfs_mnttab_fini(hdl); libzfs_core_fini(); regfree(&hdl->libzfs_urire); fletcher_4_fini(); #if LIBFETCH_DYNAMIC if (hdl->libfetch != (void *)-1 && hdl->libfetch != NULL) (void) dlclose(hdl->libfetch); free(hdl->libfetch_load_error); #endif free(hdl); } libzfs_handle_t * zpool_get_handle(zpool_handle_t *zhp) { return (zhp->zpool_hdl); } libzfs_handle_t * zfs_get_handle(zfs_handle_t *zhp) { return (zhp->zfs_hdl); } zpool_handle_t * zfs_get_pool_handle(const zfs_handle_t *zhp) { return (zhp->zpool_hdl); } /* * Given a name, determine whether or not it's a valid path * (starts with '/' or "./"). If so, walk the mnttab trying * to match the device number. If not, treat the path as an * fs/vol/snap/bkmark name. */ zfs_handle_t * zfs_path_to_zhandle(libzfs_handle_t *hdl, const char *path, zfs_type_t argtype) { struct stat64 statbuf; struct extmnttab entry; if (path[0] != '/' && strncmp(path, "./", strlen("./")) != 0) { /* * It's not a valid path, assume it's a name of type 'argtype'. */ return (zfs_open(hdl, path, argtype)); } if (getextmntent(path, &entry, &statbuf) != 0) return (NULL); if (strcmp(entry.mnt_fstype, MNTTYPE_ZFS) != 0) { (void) fprintf(stderr, gettext("'%s': not a ZFS filesystem\n"), path); return (NULL); } return (zfs_open(hdl, entry.mnt_special, ZFS_TYPE_FILESYSTEM)); } /* * Initialize the zc_nvlist_dst member to prepare for receiving an nvlist from * an ioctl(). */ void zcmd_alloc_dst_nvlist(libzfs_handle_t *hdl, zfs_cmd_t *zc, size_t len) { if (len == 0) len = 256 * 1024; zc->zc_nvlist_dst_size = len; zc->zc_nvlist_dst = (uint64_t)(uintptr_t)zfs_alloc(hdl, zc->zc_nvlist_dst_size); } /* * Called when an ioctl() which returns an nvlist fails with ENOMEM. This will * expand the nvlist to the size specified in 'zc_nvlist_dst_size', which was * filled in by the kernel to indicate the actual required size. */ void zcmd_expand_dst_nvlist(libzfs_handle_t *hdl, zfs_cmd_t *zc) { free((void *)(uintptr_t)zc->zc_nvlist_dst); zc->zc_nvlist_dst = (uint64_t)(uintptr_t)zfs_alloc(hdl, zc->zc_nvlist_dst_size); } /* * Called to free the src and dst nvlists stored in the command structure. */ void zcmd_free_nvlists(zfs_cmd_t *zc) { free((void *)(uintptr_t)zc->zc_nvlist_conf); free((void *)(uintptr_t)zc->zc_nvlist_src); free((void *)(uintptr_t)zc->zc_nvlist_dst); zc->zc_nvlist_conf = 0; zc->zc_nvlist_src = 0; zc->zc_nvlist_dst = 0; } static void zcmd_write_nvlist_com(libzfs_handle_t *hdl, uint64_t *outnv, uint64_t *outlen, nvlist_t *nvl) { char *packed; size_t len = fnvlist_size(nvl); packed = zfs_alloc(hdl, len); verify(nvlist_pack(nvl, &packed, &len, NV_ENCODE_NATIVE, 0) == 0); *outnv = (uint64_t)(uintptr_t)packed; *outlen = len; } void zcmd_write_conf_nvlist(libzfs_handle_t *hdl, zfs_cmd_t *zc, nvlist_t *nvl) { zcmd_write_nvlist_com(hdl, &zc->zc_nvlist_conf, &zc->zc_nvlist_conf_size, nvl); } void zcmd_write_src_nvlist(libzfs_handle_t *hdl, zfs_cmd_t *zc, nvlist_t *nvl) { zcmd_write_nvlist_com(hdl, &zc->zc_nvlist_src, &zc->zc_nvlist_src_size, nvl); } /* * Unpacks an nvlist from the ZFS ioctl command structure. */ int zcmd_read_dst_nvlist(libzfs_handle_t *hdl, zfs_cmd_t *zc, nvlist_t **nvlp) { if (nvlist_unpack((void *)(uintptr_t)zc->zc_nvlist_dst, zc->zc_nvlist_dst_size, nvlp, 0) != 0) return (no_memory(hdl)); return (0); } /* * ================================================================ * API shared by zfs and zpool property management * ================================================================ */ static void zprop_print_headers(zprop_get_cbdata_t *cbp, zfs_type_t type) { - zprop_list_t *pl = cbp->cb_proplist; + zprop_list_t *pl; int i; char *title; size_t len; cbp->cb_first = B_FALSE; if (cbp->cb_scripted) return; /* * Start with the length of the column headers. */ cbp->cb_colwidths[GET_COL_NAME] = strlen(dgettext(TEXT_DOMAIN, "NAME")); cbp->cb_colwidths[GET_COL_PROPERTY] = strlen(dgettext(TEXT_DOMAIN, "PROPERTY")); cbp->cb_colwidths[GET_COL_VALUE] = strlen(dgettext(TEXT_DOMAIN, "VALUE")); cbp->cb_colwidths[GET_COL_RECVD] = strlen(dgettext(TEXT_DOMAIN, "RECEIVED")); cbp->cb_colwidths[GET_COL_SOURCE] = strlen(dgettext(TEXT_DOMAIN, "SOURCE")); /* first property is always NAME */ assert(cbp->cb_proplist->pl_prop == ((type == ZFS_TYPE_POOL) ? ZPOOL_PROP_NAME : ((type == ZFS_TYPE_VDEV) ? VDEV_PROP_NAME : ZFS_PROP_NAME))); /* * Go through and calculate the widths for each column. For the * 'source' column, we kludge it up by taking the worst-case scenario of * inheriting from the longest name. This is acceptable because in the * majority of cases 'SOURCE' is the last column displayed, and we don't * use the width anyway. Note that the 'VALUE' column can be oversized, * if the name of the property is much longer than any values we find. */ for (pl = cbp->cb_proplist; pl != NULL; pl = pl->pl_next) { /* * 'PROPERTY' column */ if (pl->pl_prop != ZPROP_USERPROP) { const char *propname = (type == ZFS_TYPE_POOL) ? zpool_prop_to_name(pl->pl_prop) : ((type == ZFS_TYPE_VDEV) ? vdev_prop_to_name(pl->pl_prop) : zfs_prop_to_name(pl->pl_prop)); assert(propname != NULL); len = strlen(propname); if (len > cbp->cb_colwidths[GET_COL_PROPERTY]) cbp->cb_colwidths[GET_COL_PROPERTY] = len; } else { assert(pl->pl_user_prop != NULL); len = strlen(pl->pl_user_prop); if (len > cbp->cb_colwidths[GET_COL_PROPERTY]) cbp->cb_colwidths[GET_COL_PROPERTY] = len; } /* * 'VALUE' column. The first property is always the 'name' * property that was tacked on either by /sbin/zfs's * zfs_do_get() or when calling zprop_expand_list(), so we * ignore its width. If the user specified the name property * to display, then it will be later in the list in any case. */ if (pl != cbp->cb_proplist && pl->pl_width > cbp->cb_colwidths[GET_COL_VALUE]) cbp->cb_colwidths[GET_COL_VALUE] = pl->pl_width; /* 'RECEIVED' column. */ if (pl != cbp->cb_proplist && pl->pl_recvd_width > cbp->cb_colwidths[GET_COL_RECVD]) cbp->cb_colwidths[GET_COL_RECVD] = pl->pl_recvd_width; /* * 'NAME' and 'SOURCE' columns */ if (pl->pl_prop == ((type == ZFS_TYPE_POOL) ? ZPOOL_PROP_NAME : ((type == ZFS_TYPE_VDEV) ? VDEV_PROP_NAME : ZFS_PROP_NAME)) && pl->pl_width > cbp->cb_colwidths[GET_COL_NAME]) { cbp->cb_colwidths[GET_COL_NAME] = pl->pl_width; cbp->cb_colwidths[GET_COL_SOURCE] = pl->pl_width + strlen(dgettext(TEXT_DOMAIN, "inherited from")); } } /* * Now go through and print the headers. */ for (i = 0; i < ZFS_GET_NCOLS; i++) { switch (cbp->cb_columns[i]) { case GET_COL_NAME: title = dgettext(TEXT_DOMAIN, "NAME"); break; case GET_COL_PROPERTY: title = dgettext(TEXT_DOMAIN, "PROPERTY"); break; case GET_COL_VALUE: title = dgettext(TEXT_DOMAIN, "VALUE"); break; case GET_COL_RECVD: title = dgettext(TEXT_DOMAIN, "RECEIVED"); break; case GET_COL_SOURCE: title = dgettext(TEXT_DOMAIN, "SOURCE"); break; default: title = NULL; } if (title != NULL) { if (i == (ZFS_GET_NCOLS - 1) || cbp->cb_columns[i + 1] == GET_COL_NONE) (void) printf("%s", title); else (void) printf("%-*s ", cbp->cb_colwidths[cbp->cb_columns[i]], title); } } (void) printf("\n"); } /* * Display a single line of output, according to the settings in the callback * structure. */ void zprop_print_one_property(const char *name, zprop_get_cbdata_t *cbp, const char *propname, const char *value, zprop_source_t sourcetype, const char *source, const char *recvd_value) { int i; const char *str = NULL; char buf[128]; /* * Ignore those source types that the user has chosen to ignore. */ if ((sourcetype & cbp->cb_sources) == 0) return; if (cbp->cb_first) zprop_print_headers(cbp, cbp->cb_type); for (i = 0; i < ZFS_GET_NCOLS; i++) { switch (cbp->cb_columns[i]) { case GET_COL_NAME: str = name; break; case GET_COL_PROPERTY: str = propname; break; case GET_COL_VALUE: str = value; break; case GET_COL_SOURCE: switch (sourcetype) { case ZPROP_SRC_NONE: str = "-"; break; case ZPROP_SRC_DEFAULT: str = "default"; break; case ZPROP_SRC_LOCAL: str = "local"; break; case ZPROP_SRC_TEMPORARY: str = "temporary"; break; case ZPROP_SRC_INHERITED: (void) snprintf(buf, sizeof (buf), "inherited from %s", source); str = buf; break; case ZPROP_SRC_RECEIVED: str = "received"; break; default: str = NULL; assert(!"unhandled zprop_source_t"); } break; case GET_COL_RECVD: str = (recvd_value == NULL ? "-" : recvd_value); break; default: continue; } if (i == (ZFS_GET_NCOLS - 1) || cbp->cb_columns[i + 1] == GET_COL_NONE) (void) printf("%s", str); else if (cbp->cb_scripted) (void) printf("%s\t", str); else (void) printf("%-*s ", cbp->cb_colwidths[cbp->cb_columns[i]], str); } (void) printf("\n"); } /* * Given a numeric suffix, convert the value into a number of bits that the * resulting value must be shifted. */ static int str2shift(libzfs_handle_t *hdl, const char *buf) { const char *ends = "BKMGTPEZ"; int i; if (buf[0] == '\0') return (0); for (i = 0; i < strlen(ends); i++) { if (toupper(buf[0]) == ends[i]) break; } if (i == strlen(ends)) { if (hdl) zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "invalid numeric suffix '%s'"), buf); return (-1); } /* * Allow 'G' = 'GB' = 'GiB', case-insensitively. * However, 'BB' and 'BiB' are disallowed. */ if (buf[1] == '\0' || (toupper(buf[0]) != 'B' && ((toupper(buf[1]) == 'B' && buf[2] == '\0') || (toupper(buf[1]) == 'I' && toupper(buf[2]) == 'B' && buf[3] == '\0')))) return (10 * i); if (hdl) zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "invalid numeric suffix '%s'"), buf); return (-1); } /* * Convert a string of the form '100G' into a real number. Used when setting * properties or creating a volume. 'buf' is used to place an extended error * message for the caller to use. */ int zfs_nicestrtonum(libzfs_handle_t *hdl, const char *value, uint64_t *num) { char *end; int shift; *num = 0; /* Check to see if this looks like a number. */ if ((value[0] < '0' || value[0] > '9') && value[0] != '.') { if (hdl) zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "bad numeric value '%s'"), value); return (-1); } /* Rely on strtoull() to process the numeric portion. */ errno = 0; *num = strtoull(value, &end, 10); /* * Check for ERANGE, which indicates that the value is too large to fit * in a 64-bit value. */ if (errno == ERANGE) { if (hdl) zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "numeric value is too large")); return (-1); } /* * If we have a decimal value, then do the computation with floating * point arithmetic. Otherwise, use standard arithmetic. */ if (*end == '.') { double fval = strtod(value, &end); if ((shift = str2shift(hdl, end)) == -1) return (-1); fval *= pow(2, shift); /* * UINT64_MAX is not exactly representable as a double. * The closest representation is UINT64_MAX + 1, so we * use a >= comparison instead of > for the bounds check. */ if (fval >= (double)UINT64_MAX) { if (hdl) zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "numeric value is too large")); return (-1); } *num = (uint64_t)fval; } else { if ((shift = str2shift(hdl, end)) == -1) return (-1); /* Check for overflow */ if (shift >= 64 || (*num << shift) >> shift != *num) { if (hdl) zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "numeric value is too large")); return (-1); } *num <<= shift; } return (0); } /* * Given a propname=value nvpair to set, parse any numeric properties * (index, boolean, etc) if they are specified as strings and add the * resulting nvpair to the returned nvlist. * * At the DSL layer, all properties are either 64-bit numbers or strings. * We want the user to be able to ignore this fact and specify properties * as native values (numbers, for example) or as strings (to simplify * command line utilities). This also handles converting index types * (compression, checksum, etc) from strings to their on-disk index. */ int zprop_parse_value(libzfs_handle_t *hdl, nvpair_t *elem, int prop, zfs_type_t type, nvlist_t *ret, char **svalp, uint64_t *ivalp, const char *errbuf) { data_type_t datatype = nvpair_type(elem); zprop_type_t proptype; const char *propname; char *value; boolean_t isnone = B_FALSE; boolean_t isauto = B_FALSE; int err = 0; if (type == ZFS_TYPE_POOL) { proptype = zpool_prop_get_type(prop); propname = zpool_prop_to_name(prop); } else if (type == ZFS_TYPE_VDEV) { proptype = vdev_prop_get_type(prop); propname = vdev_prop_to_name(prop); } else { proptype = zfs_prop_get_type(prop); propname = zfs_prop_to_name(prop); } /* * Convert any properties to the internal DSL value types. */ *svalp = NULL; *ivalp = 0; switch (proptype) { case PROP_TYPE_STRING: if (datatype != DATA_TYPE_STRING) { zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "'%s' must be a string"), nvpair_name(elem)); goto error; } err = nvpair_value_string(elem, svalp); if (err != 0) { zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "'%s' is invalid"), nvpair_name(elem)); goto error; } if (strlen(*svalp) >= ZFS_MAXPROPLEN) { zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "'%s' is too long"), nvpair_name(elem)); goto error; } break; case PROP_TYPE_NUMBER: if (datatype == DATA_TYPE_STRING) { (void) nvpair_value_string(elem, &value); if (strcmp(value, "none") == 0) { isnone = B_TRUE; } else if (strcmp(value, "auto") == 0) { isauto = B_TRUE; } else if (zfs_nicestrtonum(hdl, value, ivalp) != 0) { goto error; } } else if (datatype == DATA_TYPE_UINT64) { (void) nvpair_value_uint64(elem, ivalp); } else { zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "'%s' must be a number"), nvpair_name(elem)); goto error; } /* * Quota special: force 'none' and don't allow 0. */ if ((type & ZFS_TYPE_DATASET) && *ivalp == 0 && !isnone && (prop == ZFS_PROP_QUOTA || prop == ZFS_PROP_REFQUOTA)) { zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "use 'none' to disable quota/refquota")); goto error; } /* * Special handling for "*_limit=none". In this case it's not * 0 but UINT64_MAX. */ if ((type & ZFS_TYPE_DATASET) && isnone && (prop == ZFS_PROP_FILESYSTEM_LIMIT || prop == ZFS_PROP_SNAPSHOT_LIMIT)) { *ivalp = UINT64_MAX; } /* * Special handling for setting 'refreservation' to 'auto'. Use * UINT64_MAX to tell the caller to use zfs_fix_auto_resv(). * 'auto' is only allowed on volumes. */ if (isauto) { switch (prop) { case ZFS_PROP_REFRESERVATION: if ((type & ZFS_TYPE_VOLUME) == 0) { zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "'%s=auto' only allowed on " "volumes"), nvpair_name(elem)); goto error; } *ivalp = UINT64_MAX; break; default: zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "'auto' is invalid value for '%s'"), nvpair_name(elem)); goto error; } } break; case PROP_TYPE_INDEX: if (datatype != DATA_TYPE_STRING) { zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "'%s' must be a string"), nvpair_name(elem)); goto error; } (void) nvpair_value_string(elem, &value); if (zprop_string_to_index(prop, value, ivalp, type) != 0) { zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "'%s' must be one of '%s'"), propname, zprop_values(prop, type)); goto error; } break; default: abort(); } /* * Add the result to our return set of properties. */ if (*svalp != NULL) { if (nvlist_add_string(ret, propname, *svalp) != 0) { (void) no_memory(hdl); return (-1); } } else { if (nvlist_add_uint64(ret, propname, *ivalp) != 0) { (void) no_memory(hdl); return (-1); } } return (0); error: (void) zfs_error(hdl, EZFS_BADPROP, errbuf); return (-1); } static int addlist(libzfs_handle_t *hdl, const char *propname, zprop_list_t **listp, zfs_type_t type) { int prop = zprop_name_to_prop(propname, type); if (prop != ZPROP_INVAL && !zprop_valid_for_type(prop, type, B_FALSE)) prop = ZPROP_INVAL; /* * Return failure if no property table entry was found and this isn't * a user-defined property. */ if (prop == ZPROP_USERPROP && ((type == ZFS_TYPE_POOL && !zpool_prop_feature(propname) && !zpool_prop_unsupported(propname)) || ((type == ZFS_TYPE_DATASET) && !zfs_prop_user(propname) && !zfs_prop_userquota(propname) && !zfs_prop_written(propname)) || ((type == ZFS_TYPE_VDEV) && !vdev_prop_user(propname)))) { zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "invalid property '%s'"), propname); return (zfs_error(hdl, EZFS_BADPROP, dgettext(TEXT_DOMAIN, "bad property list"))); } zprop_list_t *entry = zfs_alloc(hdl, sizeof (*entry)); entry->pl_prop = prop; if (prop == ZPROP_USERPROP) { entry->pl_user_prop = zfs_strdup(hdl, propname); entry->pl_width = strlen(propname); } else { entry->pl_width = zprop_width(prop, &entry->pl_fixed, type); } *listp = entry; return (0); } /* * Given a comma-separated list of properties, construct a property list * containing both user-defined and native properties. This function will * return a NULL list if 'all' is specified, which can later be expanded * by zprop_expand_list(). */ int zprop_get_list(libzfs_handle_t *hdl, char *props, zprop_list_t **listp, zfs_type_t type) { *listp = NULL; /* * If 'all' is specified, return a NULL list. */ if (strcmp(props, "all") == 0) return (0); /* * If no props were specified, return an error. */ if (props[0] == '\0') { zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "no properties specified")); return (zfs_error(hdl, EZFS_BADPROP, dgettext(TEXT_DOMAIN, "bad property list"))); } for (char *p; (p = strsep(&props, ",")); ) if (strcmp(p, "space") == 0) { static const char *const spaceprops[] = { "name", "avail", "used", "usedbysnapshots", "usedbydataset", "usedbyrefreservation", "usedbychildren" }; for (int i = 0; i < ARRAY_SIZE(spaceprops); i++) { if (addlist(hdl, spaceprops[i], listp, type)) return (-1); listp = &(*listp)->pl_next; } } else { if (addlist(hdl, p, listp, type)) return (-1); listp = &(*listp)->pl_next; } return (0); } void zprop_free_list(zprop_list_t *pl) { zprop_list_t *next; while (pl != NULL) { next = pl->pl_next; free(pl->pl_user_prop); free(pl); pl = next; } } typedef struct expand_data { zprop_list_t **last; libzfs_handle_t *hdl; zfs_type_t type; } expand_data_t; static int zprop_expand_list_cb(int prop, void *cb) { zprop_list_t *entry; expand_data_t *edp = cb; entry = zfs_alloc(edp->hdl, sizeof (zprop_list_t)); entry->pl_prop = prop; entry->pl_width = zprop_width(prop, &entry->pl_fixed, edp->type); entry->pl_all = B_TRUE; *(edp->last) = entry; edp->last = &entry->pl_next; return (ZPROP_CONT); } int zprop_expand_list(libzfs_handle_t *hdl, zprop_list_t **plp, zfs_type_t type) { zprop_list_t *entry; zprop_list_t **last; expand_data_t exp; if (*plp == NULL) { /* * If this is the very first time we've been called for an 'all' * specification, expand the list to include all native * properties. */ last = plp; exp.last = last; exp.hdl = hdl; exp.type = type; if (zprop_iter_common(zprop_expand_list_cb, &exp, B_FALSE, B_FALSE, type) == ZPROP_INVAL) return (-1); /* * Add 'name' to the beginning of the list, which is handled * specially. */ entry = zfs_alloc(hdl, sizeof (zprop_list_t)); entry->pl_prop = ((type == ZFS_TYPE_POOL) ? ZPOOL_PROP_NAME : ((type == ZFS_TYPE_VDEV) ? VDEV_PROP_NAME : ZFS_PROP_NAME)); entry->pl_width = zprop_width(entry->pl_prop, &entry->pl_fixed, type); entry->pl_all = B_TRUE; entry->pl_next = *plp; *plp = entry; } return (0); } int zprop_iter(zprop_func func, void *cb, boolean_t show_all, boolean_t ordered, zfs_type_t type) { return (zprop_iter_common(func, cb, show_all, ordered, type)); } const char * zfs_version_userland(void) { return (ZFS_META_ALIAS); } /* * Prints both zfs userland and kernel versions * Returns 0 on success, and -1 on error */ int zfs_version_print(void) { (void) puts(ZFS_META_ALIAS); char *kver = zfs_version_kernel(); if (kver == NULL) { fprintf(stderr, "zfs_version_kernel() failed: %s\n", strerror(errno)); return (-1); } (void) printf("zfs-kmod-%s\n", kver); free(kver); return (0); } /* * Return 1 if the user requested ANSI color output, and our terminal supports * it. Return 0 for no color. */ static int use_color(void) { static int use_color = -1; char *term; /* * Optimization: * * For each zpool invocation, we do a single check to see if we should * be using color or not, and cache that value for the lifetime of the * the zpool command. That makes it cheap to call use_color() when * we're printing with color. We assume that the settings are not going * to change during the invocation of a zpool command (the user isn't * going to change the ZFS_COLOR value while zpool is running, for * example). */ if (use_color != -1) { /* * We've already figured out if we should be using color or * not. Return the cached value. */ return (use_color); } term = getenv("TERM"); /* * The user sets the ZFS_COLOR env var set to enable zpool ANSI color * output. However if NO_COLOR is set (https://no-color.org/) then * don't use it. Also, don't use color if terminal doesn't support * it. */ if (libzfs_envvar_is_set("ZFS_COLOR") && !libzfs_envvar_is_set("NO_COLOR") && isatty(STDOUT_FILENO) && term && strcmp("dumb", term) != 0 && strcmp("unknown", term) != 0) { /* Color supported */ use_color = 1; } else { use_color = 0; } return (use_color); } /* * color_start() and color_end() are used for when you want to colorize a block * of text. For example: * * color_start(ANSI_RED_FG) * printf("hello"); * printf("world"); * color_end(); */ void color_start(const char *color) { if (use_color()) fputs(color, stdout); } void color_end(void) { if (use_color()) fputs(ANSI_RESET, stdout); } /* printf() with a color. If color is NULL, then do a normal printf. */ int printf_color(const char *color, const char *format, ...) { va_list aptr; int rc; if (color) color_start(color); va_start(aptr, format); rc = vprintf(format, aptr); va_end(aptr); if (color) color_end(); return (rc); } diff --git a/lib/libzutil/os/linux/zutil_device_path_os.c b/lib/libzutil/os/linux/zutil_device_path_os.c index 05dbb39954fa..900d5e5bacd2 100644 --- a/lib/libzutil/os/linux/zutil_device_path_os.c +++ b/lib/libzutil/os/linux/zutil_device_path_os.c @@ -1,692 +1,690 @@ /* * CDDL HEADER START * * The contents of this file are subject to the terms of the * Common Development and Distribution License (the "License"). * You may not use this file except in compliance with the License. * * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE * or https://opensource.org/licenses/CDDL-1.0. * See the License for the specific language governing permissions * and limitations under the License. * * When distributing Covered Code, include this CDDL HEADER in each * file and include the License file at usr/src/OPENSOLARIS.LICENSE. * If applicable, add the following below this CDDL HEADER, with the * fields enclosed by brackets "[]" replaced with your own identifying * information: Portions Copyright [yyyy] [name of copyright owner] * * CDDL HEADER END */ /* * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. */ #include #include #include #include #include #include #include #ifdef HAVE_LIBUDEV #include #endif #include /* * Append partition suffix to an otherwise fully qualified device path. * This is used to generate the name the full path as its stored in * ZPOOL_CONFIG_PATH for whole disk devices. On success the new length * of 'path' will be returned on error a negative value is returned. */ int zfs_append_partition(char *path, size_t max_len) { int len = strlen(path); if ((strncmp(path, UDISK_ROOT, strlen(UDISK_ROOT)) == 0) || (strncmp(path, ZVOL_ROOT, strlen(ZVOL_ROOT)) == 0)) { if (len + 6 >= max_len) return (-1); (void) strcat(path, "-part1"); len += 6; } else { if (len + 2 >= max_len) return (-1); if (isdigit(path[len-1])) { (void) strcat(path, "p1"); len += 2; } else { (void) strcat(path, "1"); len += 1; } } return (len); } /* * Remove partition suffix from a vdev path. Partition suffixes may take three * forms: "-partX", "pX", or "X", where X is a string of digits. The second * case only occurs when the suffix is preceded by a digit, i.e. "md0p0" The * third case only occurs when preceded by a string matching the regular * expression "^([hsv]|xv)d[a-z]+", i.e. a scsi, ide, virtio or xen disk. * * caller must free the returned string */ char * zfs_strip_partition(const char *path) { char *tmp = strdup(path); char *part = NULL, *d = NULL; if (!tmp) return (NULL); if ((part = strstr(tmp, "-part")) && part != tmp) { d = part + 5; } else if ((part = strrchr(tmp, 'p')) && part > tmp + 1 && isdigit(*(part-1))) { d = part + 1; } else if ((tmp[0] == 'h' || tmp[0] == 's' || tmp[0] == 'v') && tmp[1] == 'd') { for (d = &tmp[2]; isalpha(*d); part = ++d) { } } else if (strncmp("xvd", tmp, 3) == 0) { for (d = &tmp[3]; isalpha(*d); part = ++d) { } } if (part && d && *d != '\0') { for (; isdigit(*d); d++) { } if (*d == '\0') *part = '\0'; } return (tmp); } /* * Same as zfs_strip_partition, but allows "/dev/" to be in the pathname * * path: /dev/sda1 * returns: /dev/sda * * Returned string must be freed. */ static char * zfs_strip_partition_path(const char *path) { char *newpath = strdup(path); char *sd_offset; char *new_sd; if (!newpath) return (NULL); /* Point to "sda1" part of "/dev/sda1" */ sd_offset = strrchr(newpath, '/') + 1; /* Get our new name "sda" */ new_sd = zfs_strip_partition(sd_offset); if (!new_sd) { free(newpath); return (NULL); } /* Paste the "sda" where "sda1" was */ strlcpy(sd_offset, new_sd, strlen(sd_offset) + 1); /* Free temporary "sda" */ free(new_sd); return (newpath); } /* * Strip the unwanted portion of a device path. */ const char * zfs_strip_path(const char *path) { size_t spath_count; const char *const *spaths = zpool_default_search_paths(&spath_count); for (size_t i = 0; i < spath_count; ++i) if (strncmp(path, spaths[i], strlen(spaths[i])) == 0 && path[strlen(spaths[i])] == '/') return (path + strlen(spaths[i]) + 1); return (path); } /* * Read the contents of a sysfs file into an allocated buffer and remove the * last newline. * * This is useful for reading sysfs files that return a single string. Return * an allocated string pointer on success, NULL otherwise. Returned buffer * must be freed by the user. */ static char * zfs_read_sysfs_file(char *filepath) { char buf[4096]; /* all sysfs files report 4k size */ char *str = NULL; FILE *fp = fopen(filepath, "r"); if (fp == NULL) { return (NULL); } if (fgets(buf, sizeof (buf), fp) == buf) { /* success */ /* Remove the last newline (if any) */ size_t len = strlen(buf); if (buf[len - 1] == '\n') { buf[len - 1] = '\0'; } str = strdup(buf); } fclose(fp); return (str); } /* * Given a dev name like "nvme0n1", return the full PCI slot sysfs path to * the drive (in /sys/bus/pci/slots). * * For example: * dev: "nvme0n1" * returns: "/sys/bus/pci/slots/0" * * 'dev' must be an NVMe device. * * Returned string must be freed. Returns NULL on error or no sysfs path. */ static char * zfs_get_pci_slots_sys_path(const char *dev_name) { DIR *dp = NULL; struct dirent *ep; char *address1 = NULL; char *address2 = NULL; char *path = NULL; char buf[MAXPATHLEN]; char *tmp; /* If they preface 'dev' with a path (like "/dev") then strip it off */ tmp = strrchr(dev_name, '/'); if (tmp != NULL) dev_name = tmp + 1; /* +1 since we want the chr after '/' */ if (strncmp("nvme", dev_name, 4) != 0) return (NULL); (void) snprintf(buf, sizeof (buf), "/sys/block/%s/device/address", dev_name); address1 = zfs_read_sysfs_file(buf); if (!address1) return (NULL); /* * /sys/block/nvme0n1/device/address format will * be "0000:01:00.0" while /sys/bus/pci/slots/0/address will be * "0000:01:00". Just NULL terminate at the '.' so they match. */ tmp = strrchr(address1, '.'); if (tmp != NULL) *tmp = '\0'; dp = opendir("/sys/bus/pci/slots/"); if (dp == NULL) { free(address1); return (NULL); } /* * Look through all the /sys/bus/pci/slots/ subdirs */ while ((ep = readdir(dp))) { /* * We only care about directory names that are a single number. * Sometimes there's other directories like * "/sys/bus/pci/slots/0-3/" in there - skip those. */ if (!zfs_isnumber(ep->d_name)) continue; (void) snprintf(buf, sizeof (buf), "/sys/bus/pci/slots/%s/address", ep->d_name); address2 = zfs_read_sysfs_file(buf); if (!address2) continue; if (strcmp(address1, address2) == 0) { /* Addresses match, we're all done */ free(address2); if (asprintf(&path, "/sys/bus/pci/slots/%s", ep->d_name) == -1) { continue; } break; } free(address2); } closedir(dp); free(address1); return (path); } /* * Given a dev name like "sda", return the full enclosure sysfs path to * the disk. You can also pass in the name with "/dev" prepended * to it (like /dev/sda). This works for both JBODs and NVMe PCI devices. * * For example, disk "sda" in enclosure slot 1: * dev_name: "sda" * returns: "/sys/class/enclosure/1:0:3:0/Slot 1" * * Or: * * dev_name: "nvme0n1" * returns: "/sys/bus/pci/slots/0" * * 'dev' must be a non-devicemapper device. * * Returned string must be freed. Returns NULL on error. */ char * zfs_get_enclosure_sysfs_path(const char *dev_name) { DIR *dp = NULL; struct dirent *ep; char buf[MAXPATHLEN]; char *tmp1 = NULL; char *tmp2 = NULL; char *tmp3 = NULL; char *path = NULL; size_t size; int tmpsize; if (dev_name == NULL) return (NULL); /* If they preface 'dev' with a path (like "/dev") then strip it off */ tmp1 = strrchr(dev_name, '/'); if (tmp1 != NULL) dev_name = tmp1 + 1; /* +1 since we want the chr after '/' */ tmpsize = asprintf(&tmp1, "/sys/block/%s/device", dev_name); if (tmpsize == -1 || tmp1 == NULL) { tmp1 = NULL; goto end; } dp = opendir(tmp1); if (dp == NULL) goto end; /* * Look though all sysfs entries in /sys/block//device for * the enclosure symlink. */ while ((ep = readdir(dp))) { /* Ignore everything that's not our enclosure_device link */ if (strstr(ep->d_name, "enclosure_device") == NULL) continue; if (tmp2 != NULL) free(tmp2); if (asprintf(&tmp2, "%s/%s", tmp1, ep->d_name) == -1) { tmp2 = NULL; break; } size = readlink(tmp2, buf, sizeof (buf)); /* Did readlink fail or crop the link name? */ if (size == -1 || size >= sizeof (buf)) break; /* * We got a valid link. readlink() doesn't terminate strings * so we have to do it. */ buf[size] = '\0'; /* * Our link will look like: * * "../../../../port-11:1:2/..STUFF../enclosure/1:0:3:0/SLOT 1" * * We want to grab the "enclosure/1:0:3:0/SLOT 1" part */ tmp3 = strstr(buf, "enclosure"); if (tmp3 == NULL) break; if (path != NULL) free(path); if (asprintf(&path, "/sys/class/%s", tmp3) == -1) { /* If asprintf() fails, 'path' is undefined */ path = NULL; break; } } end: free(tmp2); free(tmp1); if (dp != NULL) closedir(dp); if (!path) { /* * This particular disk isn't in a JBOD. It could be an NVMe * drive. If so, look up the NVMe device's path in * /sys/bus/pci/slots/. Within that directory is a 'attention' * file which controls the NVMe fault LED. */ path = zfs_get_pci_slots_sys_path(dev_name); } return (path); } /* * Allocate and return the underlying device name for a device mapper device. * * For example, dm_name = "/dev/dm-0" could return "/dev/sda". Symlinks to a * DM device (like /dev/disk/by-vdev/A0) are also allowed. * * If the DM device has multiple underlying devices (like with multipath * DM devices), then favor underlying devices that have a symlink back to their * back to their enclosure device in sysfs. This will be useful for the * zedlet scripts that toggle the fault LED. * * Returns an underlying device name, or NULL on error or no match. If dm_name * is not a DM device then return NULL. * * NOTE: The returned name string must be *freed*. */ static char * dm_get_underlying_path(const char *dm_name) { DIR *dp = NULL; struct dirent *ep; char *realp; char *tmp = NULL; char *path = NULL; char *dev_str; - int size; char *first_path = NULL; char *enclosure_path; if (dm_name == NULL) return (NULL); /* dm name may be a symlink (like /dev/disk/by-vdev/A0) */ realp = realpath(dm_name, NULL); if (realp == NULL) return (NULL); /* * If they preface 'dev' with a path (like "/dev") then strip it off. * We just want the 'dm-N' part. */ tmp = strrchr(realp, '/'); if (tmp != NULL) dev_str = tmp + 1; /* +1 since we want the chr after '/' */ else dev_str = tmp; - if ((size = asprintf(&tmp, "/sys/block/%s/slaves/", dev_str)) == -1) { + if (asprintf(&tmp, "/sys/block/%s/slaves/", dev_str) == -1) { tmp = NULL; goto end; } dp = opendir(tmp); if (dp == NULL) goto end; /* * A device-mapper device can have multiple paths to it (multipath). * Favor paths that have a symlink back to their enclosure device. * We have to do this since some enclosures may only provide a symlink * back for one underlying path to a disk and not the other. * * If no paths have links back to their enclosure, then just return the * first path. */ while ((ep = readdir(dp))) { if (ep->d_type != DT_DIR) { /* skip "." and ".." dirs */ if (!first_path) first_path = strdup(ep->d_name); enclosure_path = zfs_get_enclosure_sysfs_path(ep->d_name); if (!enclosure_path) continue; - if ((size = asprintf( - &path, "/dev/%s", ep->d_name)) == -1) + if (asprintf(&path, "/dev/%s", ep->d_name) == -1) path = NULL; free(enclosure_path); break; } } end: if (dp != NULL) closedir(dp); free(tmp); free(realp); if (!path && first_path) { /* * None of the underlying paths had a link back to their * enclosure devices. Throw up out hands and return the first * underlying path. */ - if ((size = asprintf(&path, "/dev/%s", first_path)) == -1) + if (asprintf(&path, "/dev/%s", first_path) == -1) path = NULL; } free(first_path); return (path); } /* * Return B_TRUE if device is a device mapper or multipath device. * Return B_FALSE if not. */ boolean_t zfs_dev_is_dm(const char *dev_name) { char *tmp; tmp = dm_get_underlying_path(dev_name); if (tmp == NULL) return (B_FALSE); free(tmp); return (B_TRUE); } /* * By "whole disk" we mean an entire physical disk (something we can * label, toggle the write cache on, etc.) as opposed to the full * capacity of a pseudo-device such as lofi or did. We act as if we * are labeling the disk, which should be a pretty good test of whether * it's a viable device or not. Returns B_TRUE if it is and B_FALSE if * it isn't. */ boolean_t zfs_dev_is_whole_disk(const char *dev_name) { struct dk_gpt *label = NULL; int fd; if ((fd = open(dev_name, O_RDONLY | O_DIRECT | O_CLOEXEC)) < 0) return (B_FALSE); if (efi_alloc_and_init(fd, EFI_NUMPAR, &label) != 0) { (void) close(fd); return (B_FALSE); } efi_free(label); (void) close(fd); return (B_TRUE); } /* * Lookup the underlying device for a device name * * Often you'll have a symlink to a device, a partition device, * or a multipath device, and want to look up the underlying device. * This function returns the underlying device name. If the device * name is already the underlying device, then just return the same * name. If the device is a DM device with multiple underlying devices * then return the first one. * * For example: * * 1. /dev/disk/by-id/ata-QEMU_HARDDISK_QM00001 -> ../../sda * dev_name: /dev/disk/by-id/ata-QEMU_HARDDISK_QM00001 * returns: /dev/sda * * 2. /dev/mapper/mpatha (made up of /dev/sda and /dev/sdb) * dev_name: /dev/mapper/mpatha * returns: /dev/sda (first device) * * 3. /dev/sda (already the underlying device) * dev_name: /dev/sda * returns: /dev/sda * * 4. /dev/dm-3 (mapped to /dev/sda) * dev_name: /dev/dm-3 * returns: /dev/sda * * 5. /dev/disk/by-id/scsi-0QEMU_drive-scsi0-0-0-0-part9 -> ../../sdb9 * dev_name: /dev/disk/by-id/scsi-0QEMU_drive-scsi0-0-0-0-part9 * returns: /dev/sdb * * 6. /dev/disk/by-uuid/5df030cf-3cd9-46e4-8e99-3ccb462a4e9a -> ../dev/sda2 * dev_name: /dev/disk/by-uuid/5df030cf-3cd9-46e4-8e99-3ccb462a4e9a * returns: /dev/sda * * Returns underlying device name, or NULL on error or no match. * * NOTE: The returned name string must be *freed*. */ char * zfs_get_underlying_path(const char *dev_name) { char *name = NULL; char *tmp; if (dev_name == NULL) return (NULL); tmp = dm_get_underlying_path(dev_name); /* dev_name not a DM device, so just un-symlinkize it */ if (tmp == NULL) tmp = realpath(dev_name, NULL); if (tmp != NULL) { name = zfs_strip_partition_path(tmp); free(tmp); } return (name); } #ifdef HAVE_LIBUDEV /* * A disk is considered a multipath whole disk when: * DEVNAME key value has "dm-" * DM_UUID key exists and starts with 'mpath-' * ID_PART_TABLE_TYPE key does not exist or is not gpt * ID_FS_LABEL key does not exist (disk isn't labeled) */ static boolean_t is_mpath_udev_sane(struct udev_device *dev) { const char *devname, *type, *uuid, *label; devname = udev_device_get_property_value(dev, "DEVNAME"); type = udev_device_get_property_value(dev, "ID_PART_TABLE_TYPE"); uuid = udev_device_get_property_value(dev, "DM_UUID"); label = udev_device_get_property_value(dev, "ID_FS_LABEL"); if ((devname != NULL && strncmp(devname, "/dev/dm-", 8) == 0) && ((type == NULL) || (strcmp(type, "gpt") != 0)) && ((uuid != NULL) && (strncmp(uuid, "mpath-", 6) == 0)) && (label == NULL)) { return (B_TRUE); } return (B_FALSE); } /* * Check if a disk is a multipath "blank" disk: * * 1. The disk has udev values that suggest it's a multipath disk * 2. The disk is not currently labeled with a filesystem of any type * 3. There are no partitions on the disk */ boolean_t is_mpath_whole_disk(const char *path) { struct udev *udev; struct udev_device *dev = NULL; char nodepath[MAXPATHLEN]; char *sysname; if (realpath(path, nodepath) == NULL) return (B_FALSE); sysname = strrchr(nodepath, '/') + 1; if (strncmp(sysname, "dm-", 3) != 0) return (B_FALSE); if ((udev = udev_new()) == NULL) return (B_FALSE); if ((dev = udev_device_new_from_subsystem_sysname(udev, "block", sysname)) == NULL) { udev_device_unref(dev); return (B_FALSE); } /* Sanity check some udev values */ boolean_t is_sane = is_mpath_udev_sane(dev); udev_device_unref(dev); return (is_sane); } #else /* HAVE_LIBUDEV */ boolean_t is_mpath_whole_disk(const char *path) { (void) path; return (B_FALSE); } #endif /* HAVE_LIBUDEV */ diff --git a/module/icp/algs/blake3/blake3.c b/module/icp/algs/blake3/blake3.c index 5f7018598820..604e05847ee6 100644 --- a/module/icp/algs/blake3/blake3.c +++ b/module/icp/algs/blake3/blake3.c @@ -1,732 +1,730 @@ /* * CDDL HEADER START * * The contents of this file are subject to the terms of the * Common Development and Distribution License (the "License"). * You may not use this file except in compliance with the License. * * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE * or https://opensource.org/licenses/CDDL-1.0. * See the License for the specific language governing permissions * and limitations under the License. * * When distributing Covered Code, include this CDDL HEADER in each * file and include the License file at usr/src/OPENSOLARIS.LICENSE. * If applicable, add the following below this CDDL HEADER, with the * fields enclosed by brackets "[]" replaced with your own identifying * information: Portions Copyright [yyyy] [name of copyright owner] * * CDDL HEADER END */ /* * Based on BLAKE3 v1.3.1, https://github.com/BLAKE3-team/BLAKE3 * Copyright (c) 2019-2020 Samuel Neves and Jack O'Connor * Copyright (c) 2021-2022 Tino Reichardt */ #include #include #include "blake3_impl.h" /* * We need 1056 byte stack for blake3_compress_subtree_wide() * - we define this pragma to make gcc happy */ #if defined(__GNUC__) #pragma GCC diagnostic ignored "-Wframe-larger-than=" #endif /* internal used */ typedef struct { uint32_t input_cv[8]; uint64_t counter; uint8_t block[BLAKE3_BLOCK_LEN]; uint8_t block_len; uint8_t flags; } output_t; /* internal flags */ enum blake3_flags { CHUNK_START = 1 << 0, CHUNK_END = 1 << 1, PARENT = 1 << 2, ROOT = 1 << 3, KEYED_HASH = 1 << 4, DERIVE_KEY_CONTEXT = 1 << 5, DERIVE_KEY_MATERIAL = 1 << 6, }; /* internal start */ static void chunk_state_init(blake3_chunk_state_t *ctx, const uint32_t key[8], uint8_t flags) { memcpy(ctx->cv, key, BLAKE3_KEY_LEN); ctx->chunk_counter = 0; memset(ctx->buf, 0, BLAKE3_BLOCK_LEN); ctx->buf_len = 0; ctx->blocks_compressed = 0; ctx->flags = flags; } static void chunk_state_reset(blake3_chunk_state_t *ctx, const uint32_t key[8], uint64_t chunk_counter) { memcpy(ctx->cv, key, BLAKE3_KEY_LEN); ctx->chunk_counter = chunk_counter; ctx->blocks_compressed = 0; memset(ctx->buf, 0, BLAKE3_BLOCK_LEN); ctx->buf_len = 0; } static size_t chunk_state_len(const blake3_chunk_state_t *ctx) { return (BLAKE3_BLOCK_LEN * (size_t)ctx->blocks_compressed) + ((size_t)ctx->buf_len); } static size_t chunk_state_fill_buf(blake3_chunk_state_t *ctx, const uint8_t *input, size_t input_len) { size_t take = BLAKE3_BLOCK_LEN - ((size_t)ctx->buf_len); if (take > input_len) { take = input_len; } uint8_t *dest = ctx->buf + ((size_t)ctx->buf_len); memcpy(dest, input, take); ctx->buf_len += (uint8_t)take; return (take); } static uint8_t chunk_state_maybe_start_flag(const blake3_chunk_state_t *ctx) { if (ctx->blocks_compressed == 0) { return (CHUNK_START); } else { return (0); } } static output_t make_output(const uint32_t input_cv[8], const uint8_t *block, uint8_t block_len, uint64_t counter, uint8_t flags) { output_t ret; memcpy(ret.input_cv, input_cv, 32); memcpy(ret.block, block, BLAKE3_BLOCK_LEN); ret.block_len = block_len; ret.counter = counter; ret.flags = flags; return (ret); } /* * Chaining values within a given chunk (specifically the compress_in_place * interface) are represented as words. This avoids unnecessary bytes<->words * conversion overhead in the portable implementation. However, the hash_many * interface handles both user input and parent node blocks, so it accepts * bytes. For that reason, chaining values in the CV stack are represented as * bytes. */ static void output_chaining_value(const blake3_ops_t *ops, const output_t *ctx, uint8_t cv[32]) { uint32_t cv_words[8]; memcpy(cv_words, ctx->input_cv, 32); ops->compress_in_place(cv_words, ctx->block, ctx->block_len, ctx->counter, ctx->flags); store_cv_words(cv, cv_words); } static void output_root_bytes(const blake3_ops_t *ops, const output_t *ctx, uint64_t seek, uint8_t *out, size_t out_len) { uint64_t output_block_counter = seek / 64; size_t offset_within_block = seek % 64; uint8_t wide_buf[64]; while (out_len > 0) { ops->compress_xof(ctx->input_cv, ctx->block, ctx->block_len, output_block_counter, ctx->flags | ROOT, wide_buf); size_t available_bytes = 64 - offset_within_block; size_t memcpy_len; if (out_len > available_bytes) { memcpy_len = available_bytes; } else { memcpy_len = out_len; } memcpy(out, wide_buf + offset_within_block, memcpy_len); out += memcpy_len; out_len -= memcpy_len; output_block_counter += 1; offset_within_block = 0; } } static void chunk_state_update(const blake3_ops_t *ops, blake3_chunk_state_t *ctx, const uint8_t *input, size_t input_len) { if (ctx->buf_len > 0) { size_t take = chunk_state_fill_buf(ctx, input, input_len); input += take; input_len -= take; if (input_len > 0) { ops->compress_in_place(ctx->cv, ctx->buf, BLAKE3_BLOCK_LEN, ctx->chunk_counter, ctx->flags|chunk_state_maybe_start_flag(ctx)); ctx->blocks_compressed += 1; ctx->buf_len = 0; memset(ctx->buf, 0, BLAKE3_BLOCK_LEN); } } while (input_len > BLAKE3_BLOCK_LEN) { ops->compress_in_place(ctx->cv, input, BLAKE3_BLOCK_LEN, ctx->chunk_counter, ctx->flags|chunk_state_maybe_start_flag(ctx)); ctx->blocks_compressed += 1; input += BLAKE3_BLOCK_LEN; input_len -= BLAKE3_BLOCK_LEN; } - size_t take = chunk_state_fill_buf(ctx, input, input_len); - input += take; - input_len -= take; + chunk_state_fill_buf(ctx, input, input_len); } static output_t chunk_state_output(const blake3_chunk_state_t *ctx) { uint8_t block_flags = ctx->flags | chunk_state_maybe_start_flag(ctx) | CHUNK_END; return (make_output(ctx->cv, ctx->buf, ctx->buf_len, ctx->chunk_counter, block_flags)); } static output_t parent_output(const uint8_t block[BLAKE3_BLOCK_LEN], const uint32_t key[8], uint8_t flags) { return (make_output(key, block, BLAKE3_BLOCK_LEN, 0, flags | PARENT)); } /* * Given some input larger than one chunk, return the number of bytes that * should go in the left subtree. This is the largest power-of-2 number of * chunks that leaves at least 1 byte for the right subtree. */ static size_t left_len(size_t content_len) { /* * Subtract 1 to reserve at least one byte for the right side. * content_len * should always be greater than BLAKE3_CHUNK_LEN. */ size_t full_chunks = (content_len - 1) / BLAKE3_CHUNK_LEN; return (round_down_to_power_of_2(full_chunks) * BLAKE3_CHUNK_LEN); } /* * Use SIMD parallelism to hash up to MAX_SIMD_DEGREE chunks at the same time * on a single thread. Write out the chunk chaining values and return the * number of chunks hashed. These chunks are never the root and never empty; * those cases use a different codepath. */ static size_t compress_chunks_parallel(const blake3_ops_t *ops, const uint8_t *input, size_t input_len, const uint32_t key[8], uint64_t chunk_counter, uint8_t flags, uint8_t *out) { const uint8_t *chunks_array[MAX_SIMD_DEGREE]; size_t input_position = 0; size_t chunks_array_len = 0; while (input_len - input_position >= BLAKE3_CHUNK_LEN) { chunks_array[chunks_array_len] = &input[input_position]; input_position += BLAKE3_CHUNK_LEN; chunks_array_len += 1; } ops->hash_many(chunks_array, chunks_array_len, BLAKE3_CHUNK_LEN / BLAKE3_BLOCK_LEN, key, chunk_counter, B_TRUE, flags, CHUNK_START, CHUNK_END, out); /* * Hash the remaining partial chunk, if there is one. Note that the * empty chunk (meaning the empty message) is a different codepath. */ if (input_len > input_position) { uint64_t counter = chunk_counter + (uint64_t)chunks_array_len; blake3_chunk_state_t chunk_state; chunk_state_init(&chunk_state, key, flags); chunk_state.chunk_counter = counter; chunk_state_update(ops, &chunk_state, &input[input_position], input_len - input_position); output_t output = chunk_state_output(&chunk_state); output_chaining_value(ops, &output, &out[chunks_array_len * BLAKE3_OUT_LEN]); return (chunks_array_len + 1); } else { return (chunks_array_len); } } /* * Use SIMD parallelism to hash up to MAX_SIMD_DEGREE parents at the same time * on a single thread. Write out the parent chaining values and return the * number of parents hashed. (If there's an odd input chaining value left over, * return it as an additional output.) These parents are never the root and * never empty; those cases use a different codepath. */ static size_t compress_parents_parallel(const blake3_ops_t *ops, const uint8_t *child_chaining_values, size_t num_chaining_values, const uint32_t key[8], uint8_t flags, uint8_t *out) { const uint8_t *parents_array[MAX_SIMD_DEGREE_OR_2]; size_t parents_array_len = 0; while (num_chaining_values - (2 * parents_array_len) >= 2) { parents_array[parents_array_len] = &child_chaining_values[2 * parents_array_len * BLAKE3_OUT_LEN]; parents_array_len += 1; } ops->hash_many(parents_array, parents_array_len, 1, key, 0, B_FALSE, flags | PARENT, 0, 0, out); /* If there's an odd child left over, it becomes an output. */ if (num_chaining_values > 2 * parents_array_len) { memcpy(&out[parents_array_len * BLAKE3_OUT_LEN], &child_chaining_values[2 * parents_array_len * BLAKE3_OUT_LEN], BLAKE3_OUT_LEN); return (parents_array_len + 1); } else { return (parents_array_len); } } /* * The wide helper function returns (writes out) an array of chaining values * and returns the length of that array. The number of chaining values returned * is the dyanmically detected SIMD degree, at most MAX_SIMD_DEGREE. Or fewer, * if the input is shorter than that many chunks. The reason for maintaining a * wide array of chaining values going back up the tree, is to allow the * implementation to hash as many parents in parallel as possible. * * As a special case when the SIMD degree is 1, this function will still return * at least 2 outputs. This guarantees that this function doesn't perform the * root compression. (If it did, it would use the wrong flags, and also we * wouldn't be able to implement exendable ouput.) Note that this function is * not used when the whole input is only 1 chunk long; that's a different * codepath. * * Why not just have the caller split the input on the first update(), instead * of implementing this special rule? Because we don't want to limit SIMD or * multi-threading parallelism for that update(). */ static size_t blake3_compress_subtree_wide(const blake3_ops_t *ops, const uint8_t *input, size_t input_len, const uint32_t key[8], uint64_t chunk_counter, uint8_t flags, uint8_t *out) { /* * Note that the single chunk case does *not* bump the SIMD degree up * to 2 when it is 1. If this implementation adds multi-threading in * the future, this gives us the option of multi-threading even the * 2-chunk case, which can help performance on smaller platforms. */ if (input_len <= (size_t)(ops->degree * BLAKE3_CHUNK_LEN)) { return (compress_chunks_parallel(ops, input, input_len, key, chunk_counter, flags, out)); } /* * With more than simd_degree chunks, we need to recurse. Start by * dividing the input into left and right subtrees. (Note that this is * only optimal as long as the SIMD degree is a power of 2. If we ever * get a SIMD degree of 3 or something, we'll need a more complicated * strategy.) */ size_t left_input_len = left_len(input_len); size_t right_input_len = input_len - left_input_len; const uint8_t *right_input = &input[left_input_len]; uint64_t right_chunk_counter = chunk_counter + (uint64_t)(left_input_len / BLAKE3_CHUNK_LEN); /* * Make space for the child outputs. Here we use MAX_SIMD_DEGREE_OR_2 * to account for the special case of returning 2 outputs when the * SIMD degree is 1. */ uint8_t cv_array[2 * MAX_SIMD_DEGREE_OR_2 * BLAKE3_OUT_LEN]; size_t degree = ops->degree; if (left_input_len > BLAKE3_CHUNK_LEN && degree == 1) { /* * The special case: We always use a degree of at least two, * to make sure there are two outputs. Except, as noted above, * at the chunk level, where we allow degree=1. (Note that the * 1-chunk-input case is a different codepath.) */ degree = 2; } uint8_t *right_cvs = &cv_array[degree * BLAKE3_OUT_LEN]; /* * Recurse! If this implementation adds multi-threading support in the * future, this is where it will go. */ size_t left_n = blake3_compress_subtree_wide(ops, input, left_input_len, key, chunk_counter, flags, cv_array); size_t right_n = blake3_compress_subtree_wide(ops, right_input, right_input_len, key, right_chunk_counter, flags, right_cvs); /* * The special case again. If simd_degree=1, then we'll have left_n=1 * and right_n=1. Rather than compressing them into a single output, * return them directly, to make sure we always have at least two * outputs. */ if (left_n == 1) { memcpy(out, cv_array, 2 * BLAKE3_OUT_LEN); return (2); } /* Otherwise, do one layer of parent node compression. */ size_t num_chaining_values = left_n + right_n; return compress_parents_parallel(ops, cv_array, num_chaining_values, key, flags, out); } /* * Hash a subtree with compress_subtree_wide(), and then condense the resulting * list of chaining values down to a single parent node. Don't compress that * last parent node, however. Instead, return its message bytes (the * concatenated chaining values of its children). This is necessary when the * first call to update() supplies a complete subtree, because the topmost * parent node of that subtree could end up being the root. It's also necessary * for extended output in the general case. * * As with compress_subtree_wide(), this function is not used on inputs of 1 * chunk or less. That's a different codepath. */ static void compress_subtree_to_parent_node(const blake3_ops_t *ops, const uint8_t *input, size_t input_len, const uint32_t key[8], uint64_t chunk_counter, uint8_t flags, uint8_t out[2 * BLAKE3_OUT_LEN]) { uint8_t cv_array[MAX_SIMD_DEGREE_OR_2 * BLAKE3_OUT_LEN]; size_t num_cvs = blake3_compress_subtree_wide(ops, input, input_len, key, chunk_counter, flags, cv_array); /* * If MAX_SIMD_DEGREE is greater than 2 and there's enough input, * compress_subtree_wide() returns more than 2 chaining values. Condense * them into 2 by forming parent nodes repeatedly. */ uint8_t out_array[MAX_SIMD_DEGREE_OR_2 * BLAKE3_OUT_LEN / 2]; while (num_cvs > 2) { num_cvs = compress_parents_parallel(ops, cv_array, num_cvs, key, flags, out_array); memcpy(cv_array, out_array, num_cvs * BLAKE3_OUT_LEN); } memcpy(out, cv_array, 2 * BLAKE3_OUT_LEN); } static void hasher_init_base(BLAKE3_CTX *ctx, const uint32_t key[8], uint8_t flags) { memcpy(ctx->key, key, BLAKE3_KEY_LEN); chunk_state_init(&ctx->chunk, key, flags); ctx->cv_stack_len = 0; ctx->ops = blake3_impl_get_ops(); } /* * As described in hasher_push_cv() below, we do "lazy merging", delaying * merges until right before the next CV is about to be added. This is * different from the reference implementation. Another difference is that we * aren't always merging 1 chunk at a time. Instead, each CV might represent * any power-of-two number of chunks, as long as the smaller-above-larger * stack order is maintained. Instead of the "count the trailing 0-bits" * algorithm described in the spec, we use a "count the total number of * 1-bits" variant that doesn't require us to retain the subtree size of the * CV on top of the stack. The principle is the same: each CV that should * remain in the stack is represented by a 1-bit in the total number of chunks * (or bytes) so far. */ static void hasher_merge_cv_stack(BLAKE3_CTX *ctx, uint64_t total_len) { size_t post_merge_stack_len = (size_t)popcnt(total_len); while (ctx->cv_stack_len > post_merge_stack_len) { uint8_t *parent_node = &ctx->cv_stack[(ctx->cv_stack_len - 2) * BLAKE3_OUT_LEN]; output_t output = parent_output(parent_node, ctx->key, ctx->chunk.flags); output_chaining_value(ctx->ops, &output, parent_node); ctx->cv_stack_len -= 1; } } /* * In reference_impl.rs, we merge the new CV with existing CVs from the stack * before pushing it. We can do that because we know more input is coming, so * we know none of the merges are root. * * This setting is different. We want to feed as much input as possible to * compress_subtree_wide(), without setting aside anything for the chunk_state. * If the user gives us 64 KiB, we want to parallelize over all 64 KiB at once * as a single subtree, if at all possible. * * This leads to two problems: * 1) This 64 KiB input might be the only call that ever gets made to update. * In this case, the root node of the 64 KiB subtree would be the root node * of the whole tree, and it would need to be ROOT finalized. We can't * compress it until we know. * 2) This 64 KiB input might complete a larger tree, whose root node is * similarly going to be the the root of the whole tree. For example, maybe * we have 196 KiB (that is, 128 + 64) hashed so far. We can't compress the * node at the root of the 256 KiB subtree until we know how to finalize it. * * The second problem is solved with "lazy merging". That is, when we're about * to add a CV to the stack, we don't merge it with anything first, as the * reference impl does. Instead we do merges using the *previous* CV that was * added, which is sitting on top of the stack, and we put the new CV * (unmerged) on top of the stack afterwards. This guarantees that we never * merge the root node until finalize(). * * Solving the first problem requires an additional tool, * compress_subtree_to_parent_node(). That function always returns the top * *two* chaining values of the subtree it's compressing. We then do lazy * merging with each of them separately, so that the second CV will always * remain unmerged. (That also helps us support extendable output when we're * hashing an input all-at-once.) */ static void hasher_push_cv(BLAKE3_CTX *ctx, uint8_t new_cv[BLAKE3_OUT_LEN], uint64_t chunk_counter) { hasher_merge_cv_stack(ctx, chunk_counter); memcpy(&ctx->cv_stack[ctx->cv_stack_len * BLAKE3_OUT_LEN], new_cv, BLAKE3_OUT_LEN); ctx->cv_stack_len += 1; } void Blake3_Init(BLAKE3_CTX *ctx) { hasher_init_base(ctx, BLAKE3_IV, 0); } void Blake3_InitKeyed(BLAKE3_CTX *ctx, const uint8_t key[BLAKE3_KEY_LEN]) { uint32_t key_words[8]; load_key_words(key, key_words); hasher_init_base(ctx, key_words, KEYED_HASH); } static void Blake3_Update2(BLAKE3_CTX *ctx, const void *input, size_t input_len) { /* * Explicitly checking for zero avoids causing UB by passing a null * pointer to memcpy. This comes up in practice with things like: * std::vector v; * blake3_hasher_update(&hasher, v.data(), v.size()); */ if (input_len == 0) { return; } const uint8_t *input_bytes = (const uint8_t *)input; /* * If we have some partial chunk bytes in the internal chunk_state, we * need to finish that chunk first. */ if (chunk_state_len(&ctx->chunk) > 0) { size_t take = BLAKE3_CHUNK_LEN - chunk_state_len(&ctx->chunk); if (take > input_len) { take = input_len; } chunk_state_update(ctx->ops, &ctx->chunk, input_bytes, take); input_bytes += take; input_len -= take; /* * If we've filled the current chunk and there's more coming, * finalize this chunk and proceed. In this case we know it's * not the root. */ if (input_len > 0) { output_t output = chunk_state_output(&ctx->chunk); uint8_t chunk_cv[32]; output_chaining_value(ctx->ops, &output, chunk_cv); hasher_push_cv(ctx, chunk_cv, ctx->chunk.chunk_counter); chunk_state_reset(&ctx->chunk, ctx->key, ctx->chunk.chunk_counter + 1); } else { return; } } /* * Now the chunk_state is clear, and we have more input. If there's * more than a single chunk (so, definitely not the root chunk), hash * the largest whole subtree we can, with the full benefits of SIMD * (and maybe in the future, multi-threading) parallelism. Two * restrictions: * - The subtree has to be a power-of-2 number of chunks. Only * subtrees along the right edge can be incomplete, and we don't know * where the right edge is going to be until we get to finalize(). * - The subtree must evenly divide the total number of chunks up * until this point (if total is not 0). If the current incomplete * subtree is only waiting for 1 more chunk, we can't hash a subtree * of 4 chunks. We have to complete the current subtree first. * Because we might need to break up the input to form powers of 2, or * to evenly divide what we already have, this part runs in a loop. */ while (input_len > BLAKE3_CHUNK_LEN) { size_t subtree_len = round_down_to_power_of_2(input_len); uint64_t count_so_far = ctx->chunk.chunk_counter * BLAKE3_CHUNK_LEN; /* * Shrink the subtree_len until it evenly divides the count so * far. We know that subtree_len itself is a power of 2, so we * can use a bitmasking trick instead of an actual remainder * operation. (Note that if the caller consistently passes * power-of-2 inputs of the same size, as is hopefully * typical, this loop condition will always fail, and * subtree_len will always be the full length of the input.) * * An aside: We don't have to shrink subtree_len quite this * much. For example, if count_so_far is 1, we could pass 2 * chunks to compress_subtree_to_parent_node. Since we'll get * 2 CVs back, we'll still get the right answer in the end, * and we might get to use 2-way SIMD parallelism. The problem * with this optimization, is that it gets us stuck always * hashing 2 chunks. The total number of chunks will remain * odd, and we'll never graduate to higher degrees of * parallelism. See * https://github.com/BLAKE3-team/BLAKE3/issues/69. */ while ((((uint64_t)(subtree_len - 1)) & count_so_far) != 0) { subtree_len /= 2; } /* * The shrunken subtree_len might now be 1 chunk long. If so, * hash that one chunk by itself. Otherwise, compress the * subtree into a pair of CVs. */ uint64_t subtree_chunks = subtree_len / BLAKE3_CHUNK_LEN; if (subtree_len <= BLAKE3_CHUNK_LEN) { blake3_chunk_state_t chunk_state; chunk_state_init(&chunk_state, ctx->key, ctx->chunk.flags); chunk_state.chunk_counter = ctx->chunk.chunk_counter; chunk_state_update(ctx->ops, &chunk_state, input_bytes, subtree_len); output_t output = chunk_state_output(&chunk_state); uint8_t cv[BLAKE3_OUT_LEN]; output_chaining_value(ctx->ops, &output, cv); hasher_push_cv(ctx, cv, chunk_state.chunk_counter); } else { /* * This is the high-performance happy path, though * getting here depends on the caller giving us a long * enough input. */ uint8_t cv_pair[2 * BLAKE3_OUT_LEN]; compress_subtree_to_parent_node(ctx->ops, input_bytes, subtree_len, ctx->key, ctx-> chunk.chunk_counter, ctx->chunk.flags, cv_pair); hasher_push_cv(ctx, cv_pair, ctx->chunk.chunk_counter); hasher_push_cv(ctx, &cv_pair[BLAKE3_OUT_LEN], ctx->chunk.chunk_counter + (subtree_chunks / 2)); } ctx->chunk.chunk_counter += subtree_chunks; input_bytes += subtree_len; input_len -= subtree_len; } /* * If there's any remaining input less than a full chunk, add it to * the chunk state. In that case, also do a final merge loop to make * sure the subtree stack doesn't contain any unmerged pairs. The * remaining input means we know these merges are non-root. This merge * loop isn't strictly necessary here, because hasher_push_chunk_cv * already does its own merge loop, but it simplifies * blake3_hasher_finalize below. */ if (input_len > 0) { chunk_state_update(ctx->ops, &ctx->chunk, input_bytes, input_len); hasher_merge_cv_stack(ctx, ctx->chunk.chunk_counter); } } void Blake3_Update(BLAKE3_CTX *ctx, const void *input, size_t todo) { size_t done = 0; const uint8_t *data = input; const size_t block_max = 1024 * 64; /* max feed buffer to leave the stack size small */ while (todo != 0) { size_t block = (todo >= block_max) ? block_max : todo; Blake3_Update2(ctx, data + done, block); done += block; todo -= block; } } void Blake3_Final(const BLAKE3_CTX *ctx, uint8_t *out) { Blake3_FinalSeek(ctx, 0, out, BLAKE3_OUT_LEN); } void Blake3_FinalSeek(const BLAKE3_CTX *ctx, uint64_t seek, uint8_t *out, size_t out_len) { /* * Explicitly checking for zero avoids causing UB by passing a null * pointer to memcpy. This comes up in practice with things like: * std::vector v; * blake3_hasher_finalize(&hasher, v.data(), v.size()); */ if (out_len == 0) { return; } /* If the subtree stack is empty, then the current chunk is the root. */ if (ctx->cv_stack_len == 0) { output_t output = chunk_state_output(&ctx->chunk); output_root_bytes(ctx->ops, &output, seek, out, out_len); return; } /* * If there are any bytes in the chunk state, finalize that chunk and * do a roll-up merge between that chunk hash and every subtree in the * stack. In this case, the extra merge loop at the end of * blake3_hasher_update guarantees that none of the subtrees in the * stack need to be merged with each other first. Otherwise, if there * are no bytes in the chunk state, then the top of the stack is a * chunk hash, and we start the merge from that. */ output_t output; size_t cvs_remaining; if (chunk_state_len(&ctx->chunk) > 0) { cvs_remaining = ctx->cv_stack_len; output = chunk_state_output(&ctx->chunk); } else { /* There are always at least 2 CVs in the stack in this case. */ cvs_remaining = ctx->cv_stack_len - 2; output = parent_output(&ctx->cv_stack[cvs_remaining * 32], ctx->key, ctx->chunk.flags); } while (cvs_remaining > 0) { cvs_remaining -= 1; uint8_t parent_block[BLAKE3_BLOCK_LEN]; memcpy(parent_block, &ctx->cv_stack[cvs_remaining * 32], 32); output_chaining_value(ctx->ops, &output, &parent_block[32]); output = parent_output(parent_block, ctx->key, ctx->chunk.flags); } output_root_bytes(ctx->ops, &output, seek, out, out_len); } diff --git a/module/icp/algs/modes/ccm.c b/module/icp/algs/modes/ccm.c index ed5498dafaa1..4a8bb9bbc2c8 100644 --- a/module/icp/algs/modes/ccm.c +++ b/module/icp/algs/modes/ccm.c @@ -1,904 +1,903 @@ /* * CDDL HEADER START * * The contents of this file are subject to the terms of the * Common Development and Distribution License (the "License"). * You may not use this file except in compliance with the License. * * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE * or https://opensource.org/licenses/CDDL-1.0. * See the License for the specific language governing permissions * and limitations under the License. * * When distributing Covered Code, include this CDDL HEADER in each * file and include the License file at usr/src/OPENSOLARIS.LICENSE. * If applicable, add the following below this CDDL HEADER, with the * fields enclosed by brackets "[]" replaced with your own identifying * information: Portions Copyright [yyyy] [name of copyright owner] * * CDDL HEADER END */ /* * Copyright 2008 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. */ #include #include #include #include #ifdef HAVE_EFFICIENT_UNALIGNED_ACCESS #include #define UNALIGNED_POINTERS_PERMITTED #endif /* * Encrypt multiple blocks of data in CCM mode. Decrypt for CCM mode * is done in another function. */ int ccm_mode_encrypt_contiguous_blocks(ccm_ctx_t *ctx, char *data, size_t length, crypto_data_t *out, size_t block_size, int (*encrypt_block)(const void *, const uint8_t *, uint8_t *), void (*copy_block)(uint8_t *, uint8_t *), void (*xor_block)(uint8_t *, uint8_t *)) { size_t remainder = length; size_t need = 0; uint8_t *datap = (uint8_t *)data; uint8_t *blockp; uint8_t *lastp; void *iov_or_mp; offset_t offset; uint8_t *out_data_1; uint8_t *out_data_2; size_t out_data_1_len; uint64_t counter; uint8_t *mac_buf; if (length + ctx->ccm_remainder_len < block_size) { /* accumulate bytes here and return */ memcpy((uint8_t *)ctx->ccm_remainder + ctx->ccm_remainder_len, datap, length); ctx->ccm_remainder_len += length; ctx->ccm_copy_to = datap; return (CRYPTO_SUCCESS); } - lastp = (uint8_t *)ctx->ccm_cb; crypto_init_ptrs(out, &iov_or_mp, &offset); mac_buf = (uint8_t *)ctx->ccm_mac_buf; do { /* Unprocessed data from last call. */ if (ctx->ccm_remainder_len > 0) { need = block_size - ctx->ccm_remainder_len; if (need > remainder) return (CRYPTO_DATA_LEN_RANGE); memcpy(&((uint8_t *)ctx->ccm_remainder) [ctx->ccm_remainder_len], datap, need); blockp = (uint8_t *)ctx->ccm_remainder; } else { blockp = datap; } /* * do CBC MAC * * XOR the previous cipher block current clear block. * mac_buf always contain previous cipher block. */ xor_block(blockp, mac_buf); encrypt_block(ctx->ccm_keysched, mac_buf, mac_buf); /* ccm_cb is the counter block */ encrypt_block(ctx->ccm_keysched, (uint8_t *)ctx->ccm_cb, (uint8_t *)ctx->ccm_tmp); lastp = (uint8_t *)ctx->ccm_tmp; /* * Increment counter. Counter bits are confined * to the bottom 64 bits of the counter block. */ #ifdef _ZFS_LITTLE_ENDIAN counter = ntohll(ctx->ccm_cb[1] & ctx->ccm_counter_mask); counter = htonll(counter + 1); #else counter = ctx->ccm_cb[1] & ctx->ccm_counter_mask; counter++; #endif /* _ZFS_LITTLE_ENDIAN */ counter &= ctx->ccm_counter_mask; ctx->ccm_cb[1] = (ctx->ccm_cb[1] & ~(ctx->ccm_counter_mask)) | counter; /* * XOR encrypted counter block with the current clear block. */ xor_block(blockp, lastp); ctx->ccm_processed_data_len += block_size; crypto_get_ptrs(out, &iov_or_mp, &offset, &out_data_1, &out_data_1_len, &out_data_2, block_size); /* copy block to where it belongs */ if (out_data_1_len == block_size) { copy_block(lastp, out_data_1); } else { memcpy(out_data_1, lastp, out_data_1_len); if (out_data_2 != NULL) { memcpy(out_data_2, lastp + out_data_1_len, block_size - out_data_1_len); } } /* update offset */ out->cd_offset += block_size; /* Update pointer to next block of data to be processed. */ if (ctx->ccm_remainder_len != 0) { datap += need; ctx->ccm_remainder_len = 0; } else { datap += block_size; } remainder = (size_t)&data[length] - (size_t)datap; /* Incomplete last block. */ if (remainder > 0 && remainder < block_size) { memcpy(ctx->ccm_remainder, datap, remainder); ctx->ccm_remainder_len = remainder; ctx->ccm_copy_to = datap; goto out; } ctx->ccm_copy_to = NULL; } while (remainder > 0); out: return (CRYPTO_SUCCESS); } void calculate_ccm_mac(ccm_ctx_t *ctx, uint8_t *ccm_mac, int (*encrypt_block)(const void *, const uint8_t *, uint8_t *)) { uint64_t counter; uint8_t *counterp, *mac_buf; int i; mac_buf = (uint8_t *)ctx->ccm_mac_buf; /* first counter block start with index 0 */ counter = 0; ctx->ccm_cb[1] = (ctx->ccm_cb[1] & ~(ctx->ccm_counter_mask)) | counter; counterp = (uint8_t *)ctx->ccm_tmp; encrypt_block(ctx->ccm_keysched, (uint8_t *)ctx->ccm_cb, counterp); /* calculate XOR of MAC with first counter block */ for (i = 0; i < ctx->ccm_mac_len; i++) { ccm_mac[i] = mac_buf[i] ^ counterp[i]; } } int ccm_encrypt_final(ccm_ctx_t *ctx, crypto_data_t *out, size_t block_size, int (*encrypt_block)(const void *, const uint8_t *, uint8_t *), void (*xor_block)(uint8_t *, uint8_t *)) { uint8_t *lastp, *mac_buf, *ccm_mac_p, *macp = NULL; void *iov_or_mp; offset_t offset; uint8_t *out_data_1; uint8_t *out_data_2; size_t out_data_1_len; int i; if (out->cd_length < (ctx->ccm_remainder_len + ctx->ccm_mac_len)) { return (CRYPTO_DATA_LEN_RANGE); } /* * When we get here, the number of bytes of payload processed * plus whatever data remains, if any, * should be the same as the number of bytes that's being * passed in the argument during init time. */ if ((ctx->ccm_processed_data_len + ctx->ccm_remainder_len) != (ctx->ccm_data_len)) { return (CRYPTO_DATA_LEN_RANGE); } mac_buf = (uint8_t *)ctx->ccm_mac_buf; if (ctx->ccm_remainder_len > 0) { /* ccm_mac_input_buf is not used for encryption */ macp = (uint8_t *)ctx->ccm_mac_input_buf; memset(macp, 0, block_size); /* copy remainder to temporary buffer */ memcpy(macp, ctx->ccm_remainder, ctx->ccm_remainder_len); /* calculate the CBC MAC */ xor_block(macp, mac_buf); encrypt_block(ctx->ccm_keysched, mac_buf, mac_buf); /* calculate the counter mode */ lastp = (uint8_t *)ctx->ccm_tmp; encrypt_block(ctx->ccm_keysched, (uint8_t *)ctx->ccm_cb, lastp); /* XOR with counter block */ for (i = 0; i < ctx->ccm_remainder_len; i++) { macp[i] ^= lastp[i]; } ctx->ccm_processed_data_len += ctx->ccm_remainder_len; } /* Calculate the CCM MAC */ ccm_mac_p = (uint8_t *)ctx->ccm_tmp; calculate_ccm_mac(ctx, ccm_mac_p, encrypt_block); crypto_init_ptrs(out, &iov_or_mp, &offset); crypto_get_ptrs(out, &iov_or_mp, &offset, &out_data_1, &out_data_1_len, &out_data_2, ctx->ccm_remainder_len + ctx->ccm_mac_len); if (ctx->ccm_remainder_len > 0) { /* copy temporary block to where it belongs */ if (out_data_2 == NULL) { /* everything will fit in out_data_1 */ memcpy(out_data_1, macp, ctx->ccm_remainder_len); memcpy(out_data_1 + ctx->ccm_remainder_len, ccm_mac_p, ctx->ccm_mac_len); } else { if (out_data_1_len < ctx->ccm_remainder_len) { size_t data_2_len_used; memcpy(out_data_1, macp, out_data_1_len); data_2_len_used = ctx->ccm_remainder_len - out_data_1_len; memcpy(out_data_2, (uint8_t *)macp + out_data_1_len, data_2_len_used); memcpy(out_data_2 + data_2_len_used, ccm_mac_p, ctx->ccm_mac_len); } else { memcpy(out_data_1, macp, out_data_1_len); if (out_data_1_len == ctx->ccm_remainder_len) { /* mac will be in out_data_2 */ memcpy(out_data_2, ccm_mac_p, ctx->ccm_mac_len); } else { size_t len_not_used = out_data_1_len - ctx->ccm_remainder_len; /* * part of mac in will be in * out_data_1, part of the mac will be * in out_data_2 */ memcpy(out_data_1 + ctx->ccm_remainder_len, ccm_mac_p, len_not_used); memcpy(out_data_2, ccm_mac_p + len_not_used, ctx->ccm_mac_len - len_not_used); } } } } else { /* copy block to where it belongs */ memcpy(out_data_1, ccm_mac_p, out_data_1_len); if (out_data_2 != NULL) { memcpy(out_data_2, ccm_mac_p + out_data_1_len, block_size - out_data_1_len); } } out->cd_offset += ctx->ccm_remainder_len + ctx->ccm_mac_len; ctx->ccm_remainder_len = 0; return (CRYPTO_SUCCESS); } /* * This will only deal with decrypting the last block of the input that * might not be a multiple of block length. */ static void ccm_decrypt_incomplete_block(ccm_ctx_t *ctx, int (*encrypt_block)(const void *, const uint8_t *, uint8_t *)) { uint8_t *datap, *outp, *counterp; int i; datap = (uint8_t *)ctx->ccm_remainder; outp = &((ctx->ccm_pt_buf)[ctx->ccm_processed_data_len]); counterp = (uint8_t *)ctx->ccm_tmp; encrypt_block(ctx->ccm_keysched, (uint8_t *)ctx->ccm_cb, counterp); /* XOR with counter block */ for (i = 0; i < ctx->ccm_remainder_len; i++) { outp[i] = datap[i] ^ counterp[i]; } } /* * This will decrypt the cipher text. However, the plaintext won't be * returned to the caller. It will be returned when decrypt_final() is * called if the MAC matches */ int ccm_mode_decrypt_contiguous_blocks(ccm_ctx_t *ctx, char *data, size_t length, crypto_data_t *out, size_t block_size, int (*encrypt_block)(const void *, const uint8_t *, uint8_t *), void (*copy_block)(uint8_t *, uint8_t *), void (*xor_block)(uint8_t *, uint8_t *)) { (void) out; size_t remainder = length; size_t need = 0; uint8_t *datap = (uint8_t *)data; uint8_t *blockp; uint8_t *cbp; uint64_t counter; size_t pt_len, total_decrypted_len, mac_len, pm_len, pd_len; uint8_t *resultp; pm_len = ctx->ccm_processed_mac_len; if (pm_len > 0) { uint8_t *tmp; /* * all ciphertext has been processed, just waiting for * part of the value of the mac */ if ((pm_len + length) > ctx->ccm_mac_len) { return (CRYPTO_ENCRYPTED_DATA_LEN_RANGE); } tmp = (uint8_t *)ctx->ccm_mac_input_buf; memcpy(tmp + pm_len, datap, length); ctx->ccm_processed_mac_len += length; return (CRYPTO_SUCCESS); } /* * If we decrypt the given data, what total amount of data would * have been decrypted? */ pd_len = ctx->ccm_processed_data_len; total_decrypted_len = pd_len + length + ctx->ccm_remainder_len; if (total_decrypted_len > (ctx->ccm_data_len + ctx->ccm_mac_len)) { return (CRYPTO_ENCRYPTED_DATA_LEN_RANGE); } pt_len = ctx->ccm_data_len; if (total_decrypted_len > pt_len) { /* * part of the input will be the MAC, need to isolate that * to be dealt with later. The left-over data in * ccm_remainder_len from last time will not be part of the * MAC. Otherwise, it would have already been taken out * when this call is made last time. */ size_t pt_part = pt_len - pd_len - ctx->ccm_remainder_len; mac_len = length - pt_part; ctx->ccm_processed_mac_len = mac_len; memcpy(ctx->ccm_mac_input_buf, data + pt_part, mac_len); if (pt_part + ctx->ccm_remainder_len < block_size) { /* * since this is last of the ciphertext, will * just decrypt with it here */ memcpy(&((uint8_t *)ctx->ccm_remainder) [ctx->ccm_remainder_len], datap, pt_part); ctx->ccm_remainder_len += pt_part; ccm_decrypt_incomplete_block(ctx, encrypt_block); ctx->ccm_processed_data_len += ctx->ccm_remainder_len; ctx->ccm_remainder_len = 0; return (CRYPTO_SUCCESS); } else { /* let rest of the code handle this */ length = pt_part; } } else if (length + ctx->ccm_remainder_len < block_size) { /* accumulate bytes here and return */ memcpy((uint8_t *)ctx->ccm_remainder + ctx->ccm_remainder_len, datap, length); ctx->ccm_remainder_len += length; ctx->ccm_copy_to = datap; return (CRYPTO_SUCCESS); } do { /* Unprocessed data from last call. */ if (ctx->ccm_remainder_len > 0) { need = block_size - ctx->ccm_remainder_len; if (need > remainder) return (CRYPTO_ENCRYPTED_DATA_LEN_RANGE); memcpy(&((uint8_t *)ctx->ccm_remainder) [ctx->ccm_remainder_len], datap, need); blockp = (uint8_t *)ctx->ccm_remainder; } else { blockp = datap; } /* Calculate the counter mode, ccm_cb is the counter block */ cbp = (uint8_t *)ctx->ccm_tmp; encrypt_block(ctx->ccm_keysched, (uint8_t *)ctx->ccm_cb, cbp); /* * Increment counter. * Counter bits are confined to the bottom 64 bits */ #ifdef _ZFS_LITTLE_ENDIAN counter = ntohll(ctx->ccm_cb[1] & ctx->ccm_counter_mask); counter = htonll(counter + 1); #else counter = ctx->ccm_cb[1] & ctx->ccm_counter_mask; counter++; #endif /* _ZFS_LITTLE_ENDIAN */ counter &= ctx->ccm_counter_mask; ctx->ccm_cb[1] = (ctx->ccm_cb[1] & ~(ctx->ccm_counter_mask)) | counter; /* XOR with the ciphertext */ xor_block(blockp, cbp); /* Copy the plaintext to the "holding buffer" */ resultp = (uint8_t *)ctx->ccm_pt_buf + ctx->ccm_processed_data_len; copy_block(cbp, resultp); ctx->ccm_processed_data_len += block_size; ctx->ccm_lastp = blockp; /* Update pointer to next block of data to be processed. */ if (ctx->ccm_remainder_len != 0) { datap += need; ctx->ccm_remainder_len = 0; } else { datap += block_size; } remainder = (size_t)&data[length] - (size_t)datap; /* Incomplete last block */ if (remainder > 0 && remainder < block_size) { memcpy(ctx->ccm_remainder, datap, remainder); ctx->ccm_remainder_len = remainder; ctx->ccm_copy_to = datap; if (ctx->ccm_processed_mac_len > 0) { /* * not expecting anymore ciphertext, just * compute plaintext for the remaining input */ ccm_decrypt_incomplete_block(ctx, encrypt_block); ctx->ccm_processed_data_len += remainder; ctx->ccm_remainder_len = 0; } goto out; } ctx->ccm_copy_to = NULL; } while (remainder > 0); out: return (CRYPTO_SUCCESS); } int ccm_decrypt_final(ccm_ctx_t *ctx, crypto_data_t *out, size_t block_size, int (*encrypt_block)(const void *, const uint8_t *, uint8_t *), void (*copy_block)(uint8_t *, uint8_t *), void (*xor_block)(uint8_t *, uint8_t *)) { size_t mac_remain, pt_len; uint8_t *pt, *mac_buf, *macp, *ccm_mac_p; int rv; pt_len = ctx->ccm_data_len; /* Make sure output buffer can fit all of the plaintext */ if (out->cd_length < pt_len) { return (CRYPTO_DATA_LEN_RANGE); } pt = ctx->ccm_pt_buf; mac_remain = ctx->ccm_processed_data_len; mac_buf = (uint8_t *)ctx->ccm_mac_buf; macp = (uint8_t *)ctx->ccm_tmp; while (mac_remain > 0) { if (mac_remain < block_size) { memset(macp, 0, block_size); memcpy(macp, pt, mac_remain); mac_remain = 0; } else { copy_block(pt, macp); mac_remain -= block_size; pt += block_size; } /* calculate the CBC MAC */ xor_block(macp, mac_buf); encrypt_block(ctx->ccm_keysched, mac_buf, mac_buf); } /* Calculate the CCM MAC */ ccm_mac_p = (uint8_t *)ctx->ccm_tmp; calculate_ccm_mac((ccm_ctx_t *)ctx, ccm_mac_p, encrypt_block); /* compare the input CCM MAC value with what we calculated */ if (memcmp(ctx->ccm_mac_input_buf, ccm_mac_p, ctx->ccm_mac_len)) { /* They don't match */ return (CRYPTO_INVALID_MAC); } else { rv = crypto_put_output_data(ctx->ccm_pt_buf, out, pt_len); if (rv != CRYPTO_SUCCESS) return (rv); out->cd_offset += pt_len; } return (CRYPTO_SUCCESS); } static int ccm_validate_args(CK_AES_CCM_PARAMS *ccm_param, boolean_t is_encrypt_init) { size_t macSize, nonceSize; uint8_t q; uint64_t maxValue; /* * Check the length of the MAC. The only valid * lengths for the MAC are: 4, 6, 8, 10, 12, 14, 16 */ macSize = ccm_param->ulMACSize; if ((macSize < 4) || (macSize > 16) || ((macSize % 2) != 0)) { return (CRYPTO_MECHANISM_PARAM_INVALID); } /* Check the nonce length. Valid values are 7, 8, 9, 10, 11, 12, 13 */ nonceSize = ccm_param->ulNonceSize; if ((nonceSize < 7) || (nonceSize > 13)) { return (CRYPTO_MECHANISM_PARAM_INVALID); } /* q is the length of the field storing the length, in bytes */ q = (uint8_t)((15 - nonceSize) & 0xFF); /* * If it is decrypt, need to make sure size of ciphertext is at least * bigger than MAC len */ if ((!is_encrypt_init) && (ccm_param->ulDataSize < macSize)) { return (CRYPTO_MECHANISM_PARAM_INVALID); } /* * Check to make sure the length of the payload is within the * range of values allowed by q */ if (q < 8) { maxValue = (1ULL << (q * 8)) - 1; } else { maxValue = ULONG_MAX; } if (ccm_param->ulDataSize > maxValue) { return (CRYPTO_MECHANISM_PARAM_INVALID); } return (CRYPTO_SUCCESS); } /* * Format the first block used in CBC-MAC (B0) and the initial counter * block based on formatting functions and counter generation functions * specified in RFC 3610 and NIST publication 800-38C, appendix A * * b0 is the first block used in CBC-MAC * cb0 is the first counter block * * It's assumed that the arguments b0 and cb0 are preallocated AES blocks * */ static void ccm_format_initial_blocks(uchar_t *nonce, ulong_t nonceSize, ulong_t authDataSize, uint8_t *b0, ccm_ctx_t *aes_ctx) { uint64_t payloadSize; uint8_t t, q, have_adata = 0; size_t limit; int i, j, k; uint64_t mask = 0; uint8_t *cb; q = (uint8_t)((15 - nonceSize) & 0xFF); t = (uint8_t)((aes_ctx->ccm_mac_len) & 0xFF); /* Construct the first octet of b0 */ if (authDataSize > 0) { have_adata = 1; } b0[0] = (have_adata << 6) | (((t - 2) / 2) << 3) | (q - 1); /* copy the nonce value into b0 */ memcpy(&(b0[1]), nonce, nonceSize); /* store the length of the payload into b0 */ memset(&(b0[1+nonceSize]), 0, q); payloadSize = aes_ctx->ccm_data_len; limit = 8 < q ? 8 : q; for (i = 0, j = 0, k = 15; i < limit; i++, j += 8, k--) { b0[k] = (uint8_t)((payloadSize >> j) & 0xFF); } /* format the counter block */ cb = (uint8_t *)aes_ctx->ccm_cb; cb[0] = 0x07 & (q-1); /* first byte */ /* copy the nonce value into the counter block */ memcpy(&(cb[1]), nonce, nonceSize); memset(&(cb[1+nonceSize]), 0, q); /* Create the mask for the counter field based on the size of nonce */ q <<= 3; while (q-- > 0) { mask |= (1ULL << q); } #ifdef _ZFS_LITTLE_ENDIAN mask = htonll(mask); #endif aes_ctx->ccm_counter_mask = mask; /* * During calculation, we start using counter block 1, we will * set it up right here. * We can just set the last byte to have the value 1, because * even with the biggest nonce of 13, the last byte of the * counter block will be used for the counter value. */ cb[15] = 0x01; } /* * Encode the length of the associated data as * specified in RFC 3610 and NIST publication 800-38C, appendix A */ static void encode_adata_len(ulong_t auth_data_len, uint8_t *encoded, size_t *encoded_len) { #ifdef UNALIGNED_POINTERS_PERMITTED uint32_t *lencoded_ptr; #ifdef _LP64 uint64_t *llencoded_ptr; #endif #endif /* UNALIGNED_POINTERS_PERMITTED */ if (auth_data_len < ((1ULL<<16) - (1ULL<<8))) { /* 0 < a < (2^16-2^8) */ *encoded_len = 2; encoded[0] = (auth_data_len & 0xff00) >> 8; encoded[1] = auth_data_len & 0xff; } else if ((auth_data_len >= ((1ULL<<16) - (1ULL<<8))) && (auth_data_len < (1ULL << 31))) { /* (2^16-2^8) <= a < 2^32 */ *encoded_len = 6; encoded[0] = 0xff; encoded[1] = 0xfe; #ifdef UNALIGNED_POINTERS_PERMITTED lencoded_ptr = (uint32_t *)&encoded[2]; *lencoded_ptr = htonl(auth_data_len); #else encoded[2] = (auth_data_len & 0xff000000) >> 24; encoded[3] = (auth_data_len & 0xff0000) >> 16; encoded[4] = (auth_data_len & 0xff00) >> 8; encoded[5] = auth_data_len & 0xff; #endif /* UNALIGNED_POINTERS_PERMITTED */ #ifdef _LP64 } else { /* 2^32 <= a < 2^64 */ *encoded_len = 10; encoded[0] = 0xff; encoded[1] = 0xff; #ifdef UNALIGNED_POINTERS_PERMITTED llencoded_ptr = (uint64_t *)&encoded[2]; *llencoded_ptr = htonl(auth_data_len); #else encoded[2] = (auth_data_len & 0xff00000000000000) >> 56; encoded[3] = (auth_data_len & 0xff000000000000) >> 48; encoded[4] = (auth_data_len & 0xff0000000000) >> 40; encoded[5] = (auth_data_len & 0xff00000000) >> 32; encoded[6] = (auth_data_len & 0xff000000) >> 24; encoded[7] = (auth_data_len & 0xff0000) >> 16; encoded[8] = (auth_data_len & 0xff00) >> 8; encoded[9] = auth_data_len & 0xff; #endif /* UNALIGNED_POINTERS_PERMITTED */ #endif /* _LP64 */ } } static int ccm_init(ccm_ctx_t *ctx, unsigned char *nonce, size_t nonce_len, unsigned char *auth_data, size_t auth_data_len, size_t block_size, int (*encrypt_block)(const void *, const uint8_t *, uint8_t *), void (*xor_block)(uint8_t *, uint8_t *)) { uint8_t *mac_buf, *datap, *ivp, *authp; size_t remainder, processed; uint8_t encoded_a[10]; /* max encoded auth data length is 10 octets */ size_t encoded_a_len = 0; mac_buf = (uint8_t *)&(ctx->ccm_mac_buf); /* * Format the 1st block for CBC-MAC and construct the * 1st counter block. * * aes_ctx->ccm_iv is used for storing the counter block * mac_buf will store b0 at this time. */ ccm_format_initial_blocks(nonce, nonce_len, auth_data_len, mac_buf, ctx); /* The IV for CBC MAC for AES CCM mode is always zero */ ivp = (uint8_t *)ctx->ccm_tmp; memset(ivp, 0, block_size); xor_block(ivp, mac_buf); /* encrypt the nonce */ encrypt_block(ctx->ccm_keysched, mac_buf, mac_buf); /* take care of the associated data, if any */ if (auth_data_len == 0) { return (CRYPTO_SUCCESS); } encode_adata_len(auth_data_len, encoded_a, &encoded_a_len); remainder = auth_data_len; /* 1st block: it contains encoded associated data, and some data */ authp = (uint8_t *)ctx->ccm_tmp; memset(authp, 0, block_size); memcpy(authp, encoded_a, encoded_a_len); processed = block_size - encoded_a_len; if (processed > auth_data_len) { /* in case auth_data is very small */ processed = auth_data_len; } memcpy(authp+encoded_a_len, auth_data, processed); /* xor with previous buffer */ xor_block(authp, mac_buf); encrypt_block(ctx->ccm_keysched, mac_buf, mac_buf); remainder -= processed; if (remainder == 0) { /* a small amount of associated data, it's all done now */ return (CRYPTO_SUCCESS); } do { if (remainder < block_size) { /* * There's not a block full of data, pad rest of * buffer with zero */ memset(authp, 0, block_size); memcpy(authp, &(auth_data[processed]), remainder); datap = (uint8_t *)authp; remainder = 0; } else { datap = (uint8_t *)(&(auth_data[processed])); processed += block_size; remainder -= block_size; } xor_block(datap, mac_buf); encrypt_block(ctx->ccm_keysched, mac_buf, mac_buf); } while (remainder > 0); return (CRYPTO_SUCCESS); } /* * The following function should be call at encrypt or decrypt init time * for AES CCM mode. */ int ccm_init_ctx(ccm_ctx_t *ccm_ctx, char *param, int kmflag, boolean_t is_encrypt_init, size_t block_size, int (*encrypt_block)(const void *, const uint8_t *, uint8_t *), void (*xor_block)(uint8_t *, uint8_t *)) { int rv; CK_AES_CCM_PARAMS *ccm_param; if (param != NULL) { ccm_param = (CK_AES_CCM_PARAMS *)param; if ((rv = ccm_validate_args(ccm_param, is_encrypt_init)) != 0) { return (rv); } ccm_ctx->ccm_mac_len = ccm_param->ulMACSize; if (is_encrypt_init) { ccm_ctx->ccm_data_len = ccm_param->ulDataSize; } else { ccm_ctx->ccm_data_len = ccm_param->ulDataSize - ccm_ctx->ccm_mac_len; ccm_ctx->ccm_processed_mac_len = 0; } ccm_ctx->ccm_processed_data_len = 0; ccm_ctx->ccm_flags |= CCM_MODE; } else { return (CRYPTO_MECHANISM_PARAM_INVALID); } if (ccm_init(ccm_ctx, ccm_param->nonce, ccm_param->ulNonceSize, ccm_param->authData, ccm_param->ulAuthDataSize, block_size, encrypt_block, xor_block) != 0) { return (CRYPTO_MECHANISM_PARAM_INVALID); } if (!is_encrypt_init) { /* allocate buffer for storing decrypted plaintext */ ccm_ctx->ccm_pt_buf = vmem_alloc(ccm_ctx->ccm_data_len, kmflag); if (ccm_ctx->ccm_pt_buf == NULL) { rv = CRYPTO_HOST_MEMORY; } } return (rv); } void * ccm_alloc_ctx(int kmflag) { ccm_ctx_t *ccm_ctx; if ((ccm_ctx = kmem_zalloc(sizeof (ccm_ctx_t), kmflag)) == NULL) return (NULL); ccm_ctx->ccm_flags = CCM_MODE; return (ccm_ctx); } diff --git a/module/icp/algs/modes/ctr.c b/module/icp/algs/modes/ctr.c index c116ba3662ba..db6b1c71d5cd 100644 --- a/module/icp/algs/modes/ctr.c +++ b/module/icp/algs/modes/ctr.c @@ -1,228 +1,227 @@ /* * CDDL HEADER START * * The contents of this file are subject to the terms of the * Common Development and Distribution License (the "License"). * You may not use this file except in compliance with the License. * * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE * or https://opensource.org/licenses/CDDL-1.0. * See the License for the specific language governing permissions * and limitations under the License. * * When distributing Covered Code, include this CDDL HEADER in each * file and include the License file at usr/src/OPENSOLARIS.LICENSE. * If applicable, add the following below this CDDL HEADER, with the * fields enclosed by brackets "[]" replaced with your own identifying * information: Portions Copyright [yyyy] [name of copyright owner] * * CDDL HEADER END */ /* * Copyright 2008 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. */ #include #include #include #include #include /* * Encrypt and decrypt multiple blocks of data in counter mode. */ int ctr_mode_contiguous_blocks(ctr_ctx_t *ctx, char *data, size_t length, crypto_data_t *out, size_t block_size, int (*cipher)(const void *ks, const uint8_t *pt, uint8_t *ct), void (*xor_block)(uint8_t *, uint8_t *)) { size_t remainder = length; size_t need = 0; uint8_t *datap = (uint8_t *)data; uint8_t *blockp; uint8_t *lastp; void *iov_or_mp; offset_t offset; uint8_t *out_data_1; uint8_t *out_data_2; size_t out_data_1_len; uint64_t lower_counter, upper_counter; if (length + ctx->ctr_remainder_len < block_size) { /* accumulate bytes here and return */ memcpy((uint8_t *)ctx->ctr_remainder + ctx->ctr_remainder_len, datap, length); ctx->ctr_remainder_len += length; ctx->ctr_copy_to = datap; return (CRYPTO_SUCCESS); } - lastp = (uint8_t *)ctx->ctr_cb; crypto_init_ptrs(out, &iov_or_mp, &offset); do { /* Unprocessed data from last call. */ if (ctx->ctr_remainder_len > 0) { need = block_size - ctx->ctr_remainder_len; if (need > remainder) return (CRYPTO_DATA_LEN_RANGE); memcpy(&((uint8_t *)ctx->ctr_remainder) [ctx->ctr_remainder_len], datap, need); blockp = (uint8_t *)ctx->ctr_remainder; } else { blockp = datap; } /* ctr_cb is the counter block */ cipher(ctx->ctr_keysched, (uint8_t *)ctx->ctr_cb, (uint8_t *)ctx->ctr_tmp); lastp = (uint8_t *)ctx->ctr_tmp; /* * Increment Counter. */ lower_counter = ntohll(ctx->ctr_cb[1] & ctx->ctr_lower_mask); lower_counter = htonll(lower_counter + 1); lower_counter &= ctx->ctr_lower_mask; ctx->ctr_cb[1] = (ctx->ctr_cb[1] & ~(ctx->ctr_lower_mask)) | lower_counter; /* wrap around */ if (lower_counter == 0) { upper_counter = ntohll(ctx->ctr_cb[0] & ctx->ctr_upper_mask); upper_counter = htonll(upper_counter + 1); upper_counter &= ctx->ctr_upper_mask; ctx->ctr_cb[0] = (ctx->ctr_cb[0] & ~(ctx->ctr_upper_mask)) | upper_counter; } /* * XOR encrypted counter block with the current clear block. */ xor_block(blockp, lastp); crypto_get_ptrs(out, &iov_or_mp, &offset, &out_data_1, &out_data_1_len, &out_data_2, block_size); /* copy block to where it belongs */ memcpy(out_data_1, lastp, out_data_1_len); if (out_data_2 != NULL) { memcpy(out_data_2, lastp + out_data_1_len, block_size - out_data_1_len); } /* update offset */ out->cd_offset += block_size; /* Update pointer to next block of data to be processed. */ if (ctx->ctr_remainder_len != 0) { datap += need; ctx->ctr_remainder_len = 0; } else { datap += block_size; } remainder = (size_t)&data[length] - (size_t)datap; /* Incomplete last block. */ if (remainder > 0 && remainder < block_size) { memcpy(ctx->ctr_remainder, datap, remainder); ctx->ctr_remainder_len = remainder; ctx->ctr_copy_to = datap; goto out; } ctx->ctr_copy_to = NULL; } while (remainder > 0); out: return (CRYPTO_SUCCESS); } int ctr_mode_final(ctr_ctx_t *ctx, crypto_data_t *out, int (*encrypt_block)(const void *, const uint8_t *, uint8_t *)) { uint8_t *lastp; void *iov_or_mp; offset_t offset; uint8_t *out_data_1; uint8_t *out_data_2; size_t out_data_1_len; uint8_t *p; int i; if (out->cd_length < ctx->ctr_remainder_len) return (CRYPTO_DATA_LEN_RANGE); encrypt_block(ctx->ctr_keysched, (uint8_t *)ctx->ctr_cb, (uint8_t *)ctx->ctr_tmp); lastp = (uint8_t *)ctx->ctr_tmp; p = (uint8_t *)ctx->ctr_remainder; for (i = 0; i < ctx->ctr_remainder_len; i++) { p[i] ^= lastp[i]; } crypto_init_ptrs(out, &iov_or_mp, &offset); crypto_get_ptrs(out, &iov_or_mp, &offset, &out_data_1, &out_data_1_len, &out_data_2, ctx->ctr_remainder_len); memcpy(out_data_1, p, out_data_1_len); if (out_data_2 != NULL) { memcpy(out_data_2, (uint8_t *)p + out_data_1_len, ctx->ctr_remainder_len - out_data_1_len); } out->cd_offset += ctx->ctr_remainder_len; ctx->ctr_remainder_len = 0; return (CRYPTO_SUCCESS); } int ctr_init_ctx(ctr_ctx_t *ctr_ctx, ulong_t count, uint8_t *cb, void (*copy_block)(uint8_t *, uint8_t *)) { uint64_t upper_mask = 0; uint64_t lower_mask = 0; if (count == 0 || count > 128) { return (CRYPTO_MECHANISM_PARAM_INVALID); } /* upper 64 bits of the mask */ if (count >= 64) { count -= 64; upper_mask = (count == 64) ? UINT64_MAX : (1ULL << count) - 1; lower_mask = UINT64_MAX; } else { /* now the lower 63 bits */ lower_mask = (1ULL << count) - 1; } ctr_ctx->ctr_lower_mask = htonll(lower_mask); ctr_ctx->ctr_upper_mask = htonll(upper_mask); copy_block(cb, (uchar_t *)ctr_ctx->ctr_cb); ctr_ctx->ctr_lastp = (uint8_t *)&ctr_ctx->ctr_cb[0]; ctr_ctx->ctr_flags |= CTR_MODE; return (CRYPTO_SUCCESS); } void * ctr_alloc_ctx(int kmflag) { ctr_ctx_t *ctr_ctx; if ((ctr_ctx = kmem_zalloc(sizeof (ctr_ctx_t), kmflag)) == NULL) return (NULL); ctr_ctx->ctr_flags = CTR_MODE; return (ctr_ctx); } diff --git a/module/icp/algs/modes/gcm.c b/module/icp/algs/modes/gcm.c index ca328d54a7e6..16ef14b8ccaf 100644 --- a/module/icp/algs/modes/gcm.c +++ b/module/icp/algs/modes/gcm.c @@ -1,1595 +1,1594 @@ /* * CDDL HEADER START * * The contents of this file are subject to the terms of the * Common Development and Distribution License (the "License"). * You may not use this file except in compliance with the License. * * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE * or https://opensource.org/licenses/CDDL-1.0. * See the License for the specific language governing permissions * and limitations under the License. * * When distributing Covered Code, include this CDDL HEADER in each * file and include the License file at usr/src/OPENSOLARIS.LICENSE. * If applicable, add the following below this CDDL HEADER, with the * fields enclosed by brackets "[]" replaced with your own identifying * information: Portions Copyright [yyyy] [name of copyright owner] * * CDDL HEADER END */ /* * Copyright (c) 2008, 2010, Oracle and/or its affiliates. All rights reserved. */ #include #include #include #include #include #include #include #include #ifdef CAN_USE_GCM_ASM #include #endif #define GHASH(c, d, t, o) \ xor_block((uint8_t *)(d), (uint8_t *)(c)->gcm_ghash); \ (o)->mul((uint64_t *)(void *)(c)->gcm_ghash, (c)->gcm_H, \ (uint64_t *)(void *)(t)); /* Select GCM implementation */ #define IMPL_FASTEST (UINT32_MAX) #define IMPL_CYCLE (UINT32_MAX-1) #ifdef CAN_USE_GCM_ASM #define IMPL_AVX (UINT32_MAX-2) #endif #define GCM_IMPL_READ(i) (*(volatile uint32_t *) &(i)) static uint32_t icp_gcm_impl = IMPL_FASTEST; static uint32_t user_sel_impl = IMPL_FASTEST; #ifdef CAN_USE_GCM_ASM /* Does the architecture we run on support the MOVBE instruction? */ boolean_t gcm_avx_can_use_movbe = B_FALSE; /* * Whether to use the optimized openssl gcm and ghash implementations. * Set to true if module parameter icp_gcm_impl == "avx". */ static boolean_t gcm_use_avx = B_FALSE; #define GCM_IMPL_USE_AVX (*(volatile boolean_t *)&gcm_use_avx) extern boolean_t atomic_toggle_boolean_nv(volatile boolean_t *); static inline boolean_t gcm_avx_will_work(void); static inline void gcm_set_avx(boolean_t); static inline boolean_t gcm_toggle_avx(void); static inline size_t gcm_simd_get_htab_size(boolean_t); static int gcm_mode_encrypt_contiguous_blocks_avx(gcm_ctx_t *, char *, size_t, crypto_data_t *, size_t); static int gcm_encrypt_final_avx(gcm_ctx_t *, crypto_data_t *, size_t); static int gcm_decrypt_final_avx(gcm_ctx_t *, crypto_data_t *, size_t); static int gcm_init_avx(gcm_ctx_t *, unsigned char *, size_t, unsigned char *, size_t, size_t); #endif /* ifdef CAN_USE_GCM_ASM */ /* * Encrypt multiple blocks of data in GCM mode. Decrypt for GCM mode * is done in another function. */ int gcm_mode_encrypt_contiguous_blocks(gcm_ctx_t *ctx, char *data, size_t length, crypto_data_t *out, size_t block_size, int (*encrypt_block)(const void *, const uint8_t *, uint8_t *), void (*copy_block)(uint8_t *, uint8_t *), void (*xor_block)(uint8_t *, uint8_t *)) { #ifdef CAN_USE_GCM_ASM if (ctx->gcm_use_avx == B_TRUE) return (gcm_mode_encrypt_contiguous_blocks_avx( ctx, data, length, out, block_size)); #endif const gcm_impl_ops_t *gops; size_t remainder = length; size_t need = 0; uint8_t *datap = (uint8_t *)data; uint8_t *blockp; uint8_t *lastp; void *iov_or_mp; offset_t offset; uint8_t *out_data_1; uint8_t *out_data_2; size_t out_data_1_len; uint64_t counter; uint64_t counter_mask = ntohll(0x00000000ffffffffULL); if (length + ctx->gcm_remainder_len < block_size) { /* accumulate bytes here and return */ memcpy((uint8_t *)ctx->gcm_remainder + ctx->gcm_remainder_len, datap, length); ctx->gcm_remainder_len += length; if (ctx->gcm_copy_to == NULL) { ctx->gcm_copy_to = datap; } return (CRYPTO_SUCCESS); } - lastp = (uint8_t *)ctx->gcm_cb; crypto_init_ptrs(out, &iov_or_mp, &offset); gops = gcm_impl_get_ops(); do { /* Unprocessed data from last call. */ if (ctx->gcm_remainder_len > 0) { need = block_size - ctx->gcm_remainder_len; if (need > remainder) return (CRYPTO_DATA_LEN_RANGE); memcpy(&((uint8_t *)ctx->gcm_remainder) [ctx->gcm_remainder_len], datap, need); blockp = (uint8_t *)ctx->gcm_remainder; } else { blockp = datap; } /* * Increment counter. Counter bits are confined * to the bottom 32 bits of the counter block. */ counter = ntohll(ctx->gcm_cb[1] & counter_mask); counter = htonll(counter + 1); counter &= counter_mask; ctx->gcm_cb[1] = (ctx->gcm_cb[1] & ~counter_mask) | counter; encrypt_block(ctx->gcm_keysched, (uint8_t *)ctx->gcm_cb, (uint8_t *)ctx->gcm_tmp); xor_block(blockp, (uint8_t *)ctx->gcm_tmp); lastp = (uint8_t *)ctx->gcm_tmp; ctx->gcm_processed_data_len += block_size; crypto_get_ptrs(out, &iov_or_mp, &offset, &out_data_1, &out_data_1_len, &out_data_2, block_size); /* copy block to where it belongs */ if (out_data_1_len == block_size) { copy_block(lastp, out_data_1); } else { memcpy(out_data_1, lastp, out_data_1_len); if (out_data_2 != NULL) { memcpy(out_data_2, lastp + out_data_1_len, block_size - out_data_1_len); } } /* update offset */ out->cd_offset += block_size; /* add ciphertext to the hash */ GHASH(ctx, ctx->gcm_tmp, ctx->gcm_ghash, gops); /* Update pointer to next block of data to be processed. */ if (ctx->gcm_remainder_len != 0) { datap += need; ctx->gcm_remainder_len = 0; } else { datap += block_size; } remainder = (size_t)&data[length] - (size_t)datap; /* Incomplete last block. */ if (remainder > 0 && remainder < block_size) { memcpy(ctx->gcm_remainder, datap, remainder); ctx->gcm_remainder_len = remainder; ctx->gcm_copy_to = datap; goto out; } ctx->gcm_copy_to = NULL; } while (remainder > 0); out: return (CRYPTO_SUCCESS); } int gcm_encrypt_final(gcm_ctx_t *ctx, crypto_data_t *out, size_t block_size, int (*encrypt_block)(const void *, const uint8_t *, uint8_t *), void (*copy_block)(uint8_t *, uint8_t *), void (*xor_block)(uint8_t *, uint8_t *)) { (void) copy_block; #ifdef CAN_USE_GCM_ASM if (ctx->gcm_use_avx == B_TRUE) return (gcm_encrypt_final_avx(ctx, out, block_size)); #endif const gcm_impl_ops_t *gops; uint64_t counter_mask = ntohll(0x00000000ffffffffULL); uint8_t *ghash, *macp = NULL; int i, rv; if (out->cd_length < (ctx->gcm_remainder_len + ctx->gcm_tag_len)) { return (CRYPTO_DATA_LEN_RANGE); } gops = gcm_impl_get_ops(); ghash = (uint8_t *)ctx->gcm_ghash; if (ctx->gcm_remainder_len > 0) { uint64_t counter; uint8_t *tmpp = (uint8_t *)ctx->gcm_tmp; /* * Here is where we deal with data that is not a * multiple of the block size. */ /* * Increment counter. */ counter = ntohll(ctx->gcm_cb[1] & counter_mask); counter = htonll(counter + 1); counter &= counter_mask; ctx->gcm_cb[1] = (ctx->gcm_cb[1] & ~counter_mask) | counter; encrypt_block(ctx->gcm_keysched, (uint8_t *)ctx->gcm_cb, (uint8_t *)ctx->gcm_tmp); macp = (uint8_t *)ctx->gcm_remainder; memset(macp + ctx->gcm_remainder_len, 0, block_size - ctx->gcm_remainder_len); /* XOR with counter block */ for (i = 0; i < ctx->gcm_remainder_len; i++) { macp[i] ^= tmpp[i]; } /* add ciphertext to the hash */ GHASH(ctx, macp, ghash, gops); ctx->gcm_processed_data_len += ctx->gcm_remainder_len; } ctx->gcm_len_a_len_c[1] = htonll(CRYPTO_BYTES2BITS(ctx->gcm_processed_data_len)); GHASH(ctx, ctx->gcm_len_a_len_c, ghash, gops); encrypt_block(ctx->gcm_keysched, (uint8_t *)ctx->gcm_J0, (uint8_t *)ctx->gcm_J0); xor_block((uint8_t *)ctx->gcm_J0, ghash); if (ctx->gcm_remainder_len > 0) { rv = crypto_put_output_data(macp, out, ctx->gcm_remainder_len); if (rv != CRYPTO_SUCCESS) return (rv); } out->cd_offset += ctx->gcm_remainder_len; ctx->gcm_remainder_len = 0; rv = crypto_put_output_data(ghash, out, ctx->gcm_tag_len); if (rv != CRYPTO_SUCCESS) return (rv); out->cd_offset += ctx->gcm_tag_len; return (CRYPTO_SUCCESS); } /* * This will only deal with decrypting the last block of the input that * might not be a multiple of block length. */ static void gcm_decrypt_incomplete_block(gcm_ctx_t *ctx, size_t block_size, size_t index, int (*encrypt_block)(const void *, const uint8_t *, uint8_t *), void (*xor_block)(uint8_t *, uint8_t *)) { uint8_t *datap, *outp, *counterp; uint64_t counter; uint64_t counter_mask = ntohll(0x00000000ffffffffULL); int i; /* * Increment counter. * Counter bits are confined to the bottom 32 bits */ counter = ntohll(ctx->gcm_cb[1] & counter_mask); counter = htonll(counter + 1); counter &= counter_mask; ctx->gcm_cb[1] = (ctx->gcm_cb[1] & ~counter_mask) | counter; datap = (uint8_t *)ctx->gcm_remainder; outp = &((ctx->gcm_pt_buf)[index]); counterp = (uint8_t *)ctx->gcm_tmp; /* authentication tag */ memset((uint8_t *)ctx->gcm_tmp, 0, block_size); memcpy((uint8_t *)ctx->gcm_tmp, datap, ctx->gcm_remainder_len); /* add ciphertext to the hash */ GHASH(ctx, ctx->gcm_tmp, ctx->gcm_ghash, gcm_impl_get_ops()); /* decrypt remaining ciphertext */ encrypt_block(ctx->gcm_keysched, (uint8_t *)ctx->gcm_cb, counterp); /* XOR with counter block */ for (i = 0; i < ctx->gcm_remainder_len; i++) { outp[i] = datap[i] ^ counterp[i]; } } int gcm_mode_decrypt_contiguous_blocks(gcm_ctx_t *ctx, char *data, size_t length, crypto_data_t *out, size_t block_size, int (*encrypt_block)(const void *, const uint8_t *, uint8_t *), void (*copy_block)(uint8_t *, uint8_t *), void (*xor_block)(uint8_t *, uint8_t *)) { (void) out, (void) block_size, (void) encrypt_block, (void) copy_block, (void) xor_block; size_t new_len; uint8_t *new; /* * Copy contiguous ciphertext input blocks to plaintext buffer. * Ciphertext will be decrypted in the final. */ if (length > 0) { new_len = ctx->gcm_pt_buf_len + length; new = vmem_alloc(new_len, KM_SLEEP); if (new == NULL) { vmem_free(ctx->gcm_pt_buf, ctx->gcm_pt_buf_len); ctx->gcm_pt_buf = NULL; return (CRYPTO_HOST_MEMORY); } if (ctx->gcm_pt_buf != NULL) { memcpy(new, ctx->gcm_pt_buf, ctx->gcm_pt_buf_len); vmem_free(ctx->gcm_pt_buf, ctx->gcm_pt_buf_len); } else { ASSERT0(ctx->gcm_pt_buf_len); } ctx->gcm_pt_buf = new; ctx->gcm_pt_buf_len = new_len; memcpy(&ctx->gcm_pt_buf[ctx->gcm_processed_data_len], data, length); ctx->gcm_processed_data_len += length; } ctx->gcm_remainder_len = 0; return (CRYPTO_SUCCESS); } int gcm_decrypt_final(gcm_ctx_t *ctx, crypto_data_t *out, size_t block_size, int (*encrypt_block)(const void *, const uint8_t *, uint8_t *), void (*xor_block)(uint8_t *, uint8_t *)) { #ifdef CAN_USE_GCM_ASM if (ctx->gcm_use_avx == B_TRUE) return (gcm_decrypt_final_avx(ctx, out, block_size)); #endif const gcm_impl_ops_t *gops; size_t pt_len; size_t remainder; uint8_t *ghash; uint8_t *blockp; uint8_t *cbp; uint64_t counter; uint64_t counter_mask = ntohll(0x00000000ffffffffULL); int processed = 0, rv; ASSERT(ctx->gcm_processed_data_len == ctx->gcm_pt_buf_len); gops = gcm_impl_get_ops(); pt_len = ctx->gcm_processed_data_len - ctx->gcm_tag_len; ghash = (uint8_t *)ctx->gcm_ghash; blockp = ctx->gcm_pt_buf; remainder = pt_len; while (remainder > 0) { /* Incomplete last block */ if (remainder < block_size) { memcpy(ctx->gcm_remainder, blockp, remainder); ctx->gcm_remainder_len = remainder; /* * not expecting anymore ciphertext, just * compute plaintext for the remaining input */ gcm_decrypt_incomplete_block(ctx, block_size, processed, encrypt_block, xor_block); ctx->gcm_remainder_len = 0; goto out; } /* add ciphertext to the hash */ GHASH(ctx, blockp, ghash, gops); /* * Increment counter. * Counter bits are confined to the bottom 32 bits */ counter = ntohll(ctx->gcm_cb[1] & counter_mask); counter = htonll(counter + 1); counter &= counter_mask; ctx->gcm_cb[1] = (ctx->gcm_cb[1] & ~counter_mask) | counter; cbp = (uint8_t *)ctx->gcm_tmp; encrypt_block(ctx->gcm_keysched, (uint8_t *)ctx->gcm_cb, cbp); /* XOR with ciphertext */ xor_block(cbp, blockp); processed += block_size; blockp += block_size; remainder -= block_size; } out: ctx->gcm_len_a_len_c[1] = htonll(CRYPTO_BYTES2BITS(pt_len)); GHASH(ctx, ctx->gcm_len_a_len_c, ghash, gops); encrypt_block(ctx->gcm_keysched, (uint8_t *)ctx->gcm_J0, (uint8_t *)ctx->gcm_J0); xor_block((uint8_t *)ctx->gcm_J0, ghash); /* compare the input authentication tag with what we calculated */ if (memcmp(&ctx->gcm_pt_buf[pt_len], ghash, ctx->gcm_tag_len)) { /* They don't match */ return (CRYPTO_INVALID_MAC); } else { rv = crypto_put_output_data(ctx->gcm_pt_buf, out, pt_len); if (rv != CRYPTO_SUCCESS) return (rv); out->cd_offset += pt_len; } return (CRYPTO_SUCCESS); } static int gcm_validate_args(CK_AES_GCM_PARAMS *gcm_param) { size_t tag_len; /* * Check the length of the authentication tag (in bits). */ tag_len = gcm_param->ulTagBits; switch (tag_len) { case 32: case 64: case 96: case 104: case 112: case 120: case 128: break; default: return (CRYPTO_MECHANISM_PARAM_INVALID); } if (gcm_param->ulIvLen == 0) return (CRYPTO_MECHANISM_PARAM_INVALID); return (CRYPTO_SUCCESS); } static void gcm_format_initial_blocks(uchar_t *iv, ulong_t iv_len, gcm_ctx_t *ctx, size_t block_size, void (*copy_block)(uint8_t *, uint8_t *), void (*xor_block)(uint8_t *, uint8_t *)) { const gcm_impl_ops_t *gops; uint8_t *cb; ulong_t remainder = iv_len; ulong_t processed = 0; uint8_t *datap, *ghash; uint64_t len_a_len_c[2]; gops = gcm_impl_get_ops(); ghash = (uint8_t *)ctx->gcm_ghash; cb = (uint8_t *)ctx->gcm_cb; if (iv_len == 12) { memcpy(cb, iv, 12); cb[12] = 0; cb[13] = 0; cb[14] = 0; cb[15] = 1; /* J0 will be used again in the final */ copy_block(cb, (uint8_t *)ctx->gcm_J0); } else { /* GHASH the IV */ do { if (remainder < block_size) { memset(cb, 0, block_size); memcpy(cb, &(iv[processed]), remainder); datap = (uint8_t *)cb; remainder = 0; } else { datap = (uint8_t *)(&(iv[processed])); processed += block_size; remainder -= block_size; } GHASH(ctx, datap, ghash, gops); } while (remainder > 0); len_a_len_c[0] = 0; len_a_len_c[1] = htonll(CRYPTO_BYTES2BITS(iv_len)); GHASH(ctx, len_a_len_c, ctx->gcm_J0, gops); /* J0 will be used again in the final */ copy_block((uint8_t *)ctx->gcm_J0, (uint8_t *)cb); } } static int gcm_init(gcm_ctx_t *ctx, unsigned char *iv, size_t iv_len, unsigned char *auth_data, size_t auth_data_len, size_t block_size, int (*encrypt_block)(const void *, const uint8_t *, uint8_t *), void (*copy_block)(uint8_t *, uint8_t *), void (*xor_block)(uint8_t *, uint8_t *)) { const gcm_impl_ops_t *gops; uint8_t *ghash, *datap, *authp; size_t remainder, processed; /* encrypt zero block to get subkey H */ memset(ctx->gcm_H, 0, sizeof (ctx->gcm_H)); encrypt_block(ctx->gcm_keysched, (uint8_t *)ctx->gcm_H, (uint8_t *)ctx->gcm_H); gcm_format_initial_blocks(iv, iv_len, ctx, block_size, copy_block, xor_block); gops = gcm_impl_get_ops(); authp = (uint8_t *)ctx->gcm_tmp; ghash = (uint8_t *)ctx->gcm_ghash; memset(authp, 0, block_size); memset(ghash, 0, block_size); processed = 0; remainder = auth_data_len; do { if (remainder < block_size) { /* * There's not a block full of data, pad rest of * buffer with zero */ if (auth_data != NULL) { memset(authp, 0, block_size); memcpy(authp, &(auth_data[processed]), remainder); } else { ASSERT0(remainder); } datap = (uint8_t *)authp; remainder = 0; } else { datap = (uint8_t *)(&(auth_data[processed])); processed += block_size; remainder -= block_size; } /* add auth data to the hash */ GHASH(ctx, datap, ghash, gops); } while (remainder > 0); return (CRYPTO_SUCCESS); } /* * The following function is called at encrypt or decrypt init time * for AES GCM mode. * * Init the GCM context struct. Handle the cycle and avx implementations here. */ int gcm_init_ctx(gcm_ctx_t *gcm_ctx, char *param, size_t block_size, int (*encrypt_block)(const void *, const uint8_t *, uint8_t *), void (*copy_block)(uint8_t *, uint8_t *), void (*xor_block)(uint8_t *, uint8_t *)) { int rv; CK_AES_GCM_PARAMS *gcm_param; if (param != NULL) { gcm_param = (CK_AES_GCM_PARAMS *)(void *)param; if ((rv = gcm_validate_args(gcm_param)) != 0) { return (rv); } gcm_ctx->gcm_tag_len = gcm_param->ulTagBits; gcm_ctx->gcm_tag_len >>= 3; gcm_ctx->gcm_processed_data_len = 0; /* these values are in bits */ gcm_ctx->gcm_len_a_len_c[0] = htonll(CRYPTO_BYTES2BITS(gcm_param->ulAADLen)); rv = CRYPTO_SUCCESS; gcm_ctx->gcm_flags |= GCM_MODE; } else { return (CRYPTO_MECHANISM_PARAM_INVALID); } #ifdef CAN_USE_GCM_ASM if (GCM_IMPL_READ(icp_gcm_impl) != IMPL_CYCLE) { gcm_ctx->gcm_use_avx = GCM_IMPL_USE_AVX; } else { /* * Handle the "cycle" implementation by creating avx and * non-avx contexts alternately. */ gcm_ctx->gcm_use_avx = gcm_toggle_avx(); /* * We don't handle byte swapped key schedules in the avx * code path. */ aes_key_t *ks = (aes_key_t *)gcm_ctx->gcm_keysched; if (ks->ops->needs_byteswap == B_TRUE) { gcm_ctx->gcm_use_avx = B_FALSE; } /* Use the MOVBE and the BSWAP variants alternately. */ if (gcm_ctx->gcm_use_avx == B_TRUE && zfs_movbe_available() == B_TRUE) { (void) atomic_toggle_boolean_nv( (volatile boolean_t *)&gcm_avx_can_use_movbe); } } /* Allocate Htab memory as needed. */ if (gcm_ctx->gcm_use_avx == B_TRUE) { size_t htab_len = gcm_simd_get_htab_size(gcm_ctx->gcm_use_avx); if (htab_len == 0) { return (CRYPTO_MECHANISM_PARAM_INVALID); } gcm_ctx->gcm_htab_len = htab_len; gcm_ctx->gcm_Htable = (uint64_t *)kmem_alloc(htab_len, KM_SLEEP); if (gcm_ctx->gcm_Htable == NULL) { return (CRYPTO_HOST_MEMORY); } } /* Avx and non avx context initialization differs from here on. */ if (gcm_ctx->gcm_use_avx == B_FALSE) { #endif /* ifdef CAN_USE_GCM_ASM */ if (gcm_init(gcm_ctx, gcm_param->pIv, gcm_param->ulIvLen, gcm_param->pAAD, gcm_param->ulAADLen, block_size, encrypt_block, copy_block, xor_block) != 0) { rv = CRYPTO_MECHANISM_PARAM_INVALID; } #ifdef CAN_USE_GCM_ASM } else { if (gcm_init_avx(gcm_ctx, gcm_param->pIv, gcm_param->ulIvLen, gcm_param->pAAD, gcm_param->ulAADLen, block_size) != 0) { rv = CRYPTO_MECHANISM_PARAM_INVALID; } } #endif /* ifdef CAN_USE_GCM_ASM */ return (rv); } int gmac_init_ctx(gcm_ctx_t *gcm_ctx, char *param, size_t block_size, int (*encrypt_block)(const void *, const uint8_t *, uint8_t *), void (*copy_block)(uint8_t *, uint8_t *), void (*xor_block)(uint8_t *, uint8_t *)) { int rv; CK_AES_GMAC_PARAMS *gmac_param; if (param != NULL) { gmac_param = (CK_AES_GMAC_PARAMS *)(void *)param; gcm_ctx->gcm_tag_len = CRYPTO_BITS2BYTES(AES_GMAC_TAG_BITS); gcm_ctx->gcm_processed_data_len = 0; /* these values are in bits */ gcm_ctx->gcm_len_a_len_c[0] = htonll(CRYPTO_BYTES2BITS(gmac_param->ulAADLen)); rv = CRYPTO_SUCCESS; gcm_ctx->gcm_flags |= GMAC_MODE; } else { return (CRYPTO_MECHANISM_PARAM_INVALID); } #ifdef CAN_USE_GCM_ASM /* * Handle the "cycle" implementation by creating avx and non avx * contexts alternately. */ if (GCM_IMPL_READ(icp_gcm_impl) != IMPL_CYCLE) { gcm_ctx->gcm_use_avx = GCM_IMPL_USE_AVX; } else { gcm_ctx->gcm_use_avx = gcm_toggle_avx(); } /* We don't handle byte swapped key schedules in the avx code path. */ aes_key_t *ks = (aes_key_t *)gcm_ctx->gcm_keysched; if (ks->ops->needs_byteswap == B_TRUE) { gcm_ctx->gcm_use_avx = B_FALSE; } /* Allocate Htab memory as needed. */ if (gcm_ctx->gcm_use_avx == B_TRUE) { size_t htab_len = gcm_simd_get_htab_size(gcm_ctx->gcm_use_avx); if (htab_len == 0) { return (CRYPTO_MECHANISM_PARAM_INVALID); } gcm_ctx->gcm_htab_len = htab_len; gcm_ctx->gcm_Htable = (uint64_t *)kmem_alloc(htab_len, KM_SLEEP); if (gcm_ctx->gcm_Htable == NULL) { return (CRYPTO_HOST_MEMORY); } } /* Avx and non avx context initialization differs from here on. */ if (gcm_ctx->gcm_use_avx == B_FALSE) { #endif /* ifdef CAN_USE_GCM_ASM */ if (gcm_init(gcm_ctx, gmac_param->pIv, AES_GMAC_IV_LEN, gmac_param->pAAD, gmac_param->ulAADLen, block_size, encrypt_block, copy_block, xor_block) != 0) { rv = CRYPTO_MECHANISM_PARAM_INVALID; } #ifdef CAN_USE_GCM_ASM } else { if (gcm_init_avx(gcm_ctx, gmac_param->pIv, AES_GMAC_IV_LEN, gmac_param->pAAD, gmac_param->ulAADLen, block_size) != 0) { rv = CRYPTO_MECHANISM_PARAM_INVALID; } } #endif /* ifdef CAN_USE_GCM_ASM */ return (rv); } void * gcm_alloc_ctx(int kmflag) { gcm_ctx_t *gcm_ctx; if ((gcm_ctx = kmem_zalloc(sizeof (gcm_ctx_t), kmflag)) == NULL) return (NULL); gcm_ctx->gcm_flags = GCM_MODE; return (gcm_ctx); } void * gmac_alloc_ctx(int kmflag) { gcm_ctx_t *gcm_ctx; if ((gcm_ctx = kmem_zalloc(sizeof (gcm_ctx_t), kmflag)) == NULL) return (NULL); gcm_ctx->gcm_flags = GMAC_MODE; return (gcm_ctx); } /* GCM implementation that contains the fastest methods */ static gcm_impl_ops_t gcm_fastest_impl = { .name = "fastest" }; /* All compiled in implementations */ static const gcm_impl_ops_t *gcm_all_impl[] = { &gcm_generic_impl, #if defined(__x86_64) && defined(HAVE_PCLMULQDQ) &gcm_pclmulqdq_impl, #endif }; /* Indicate that benchmark has been completed */ static boolean_t gcm_impl_initialized = B_FALSE; /* Hold all supported implementations */ static size_t gcm_supp_impl_cnt = 0; static gcm_impl_ops_t *gcm_supp_impl[ARRAY_SIZE(gcm_all_impl)]; /* * Returns the GCM operations for encrypt/decrypt/key setup. When a * SIMD implementation is not allowed in the current context, then * fallback to the fastest generic implementation. */ const gcm_impl_ops_t * gcm_impl_get_ops(void) { if (!kfpu_allowed()) return (&gcm_generic_impl); const gcm_impl_ops_t *ops = NULL; const uint32_t impl = GCM_IMPL_READ(icp_gcm_impl); switch (impl) { case IMPL_FASTEST: ASSERT(gcm_impl_initialized); ops = &gcm_fastest_impl; break; case IMPL_CYCLE: /* Cycle through supported implementations */ ASSERT(gcm_impl_initialized); ASSERT3U(gcm_supp_impl_cnt, >, 0); static size_t cycle_impl_idx = 0; size_t idx = (++cycle_impl_idx) % gcm_supp_impl_cnt; ops = gcm_supp_impl[idx]; break; #ifdef CAN_USE_GCM_ASM case IMPL_AVX: /* * Make sure that we return a valid implementation while * switching to the avx implementation since there still * may be unfinished non-avx contexts around. */ ops = &gcm_generic_impl; break; #endif default: ASSERT3U(impl, <, gcm_supp_impl_cnt); ASSERT3U(gcm_supp_impl_cnt, >, 0); if (impl < ARRAY_SIZE(gcm_all_impl)) ops = gcm_supp_impl[impl]; break; } ASSERT3P(ops, !=, NULL); return (ops); } /* * Initialize all supported implementations. */ void gcm_impl_init(void) { gcm_impl_ops_t *curr_impl; int i, c; /* Move supported implementations into gcm_supp_impls */ for (i = 0, c = 0; i < ARRAY_SIZE(gcm_all_impl); i++) { curr_impl = (gcm_impl_ops_t *)gcm_all_impl[i]; if (curr_impl->is_supported()) gcm_supp_impl[c++] = (gcm_impl_ops_t *)curr_impl; } gcm_supp_impl_cnt = c; /* * Set the fastest implementation given the assumption that the * hardware accelerated version is the fastest. */ #if defined(__x86_64) && defined(HAVE_PCLMULQDQ) if (gcm_pclmulqdq_impl.is_supported()) { memcpy(&gcm_fastest_impl, &gcm_pclmulqdq_impl, sizeof (gcm_fastest_impl)); } else #endif { memcpy(&gcm_fastest_impl, &gcm_generic_impl, sizeof (gcm_fastest_impl)); } strlcpy(gcm_fastest_impl.name, "fastest", GCM_IMPL_NAME_MAX); #ifdef CAN_USE_GCM_ASM /* * Use the avx implementation if it's available and the implementation * hasn't changed from its default value of fastest on module load. */ if (gcm_avx_will_work()) { #ifdef HAVE_MOVBE if (zfs_movbe_available() == B_TRUE) { atomic_swap_32(&gcm_avx_can_use_movbe, B_TRUE); } #endif if (GCM_IMPL_READ(user_sel_impl) == IMPL_FASTEST) { gcm_set_avx(B_TRUE); } } #endif /* Finish initialization */ atomic_swap_32(&icp_gcm_impl, user_sel_impl); gcm_impl_initialized = B_TRUE; } static const struct { const char *name; uint32_t sel; } gcm_impl_opts[] = { { "cycle", IMPL_CYCLE }, { "fastest", IMPL_FASTEST }, #ifdef CAN_USE_GCM_ASM { "avx", IMPL_AVX }, #endif }; /* * Function sets desired gcm implementation. * * If we are called before init(), user preference will be saved in * user_sel_impl, and applied in later init() call. This occurs when module * parameter is specified on module load. Otherwise, directly update * icp_gcm_impl. * * @val Name of gcm implementation to use * @param Unused. */ int gcm_impl_set(const char *val) { int err = -EINVAL; char req_name[GCM_IMPL_NAME_MAX]; uint32_t impl = GCM_IMPL_READ(user_sel_impl); size_t i; /* sanitize input */ i = strnlen(val, GCM_IMPL_NAME_MAX); if (i == 0 || i >= GCM_IMPL_NAME_MAX) return (err); strlcpy(req_name, val, GCM_IMPL_NAME_MAX); while (i > 0 && isspace(req_name[i-1])) i--; req_name[i] = '\0'; /* Check mandatory options */ for (i = 0; i < ARRAY_SIZE(gcm_impl_opts); i++) { #ifdef CAN_USE_GCM_ASM /* Ignore avx implementation if it won't work. */ if (gcm_impl_opts[i].sel == IMPL_AVX && !gcm_avx_will_work()) { continue; } #endif if (strcmp(req_name, gcm_impl_opts[i].name) == 0) { impl = gcm_impl_opts[i].sel; err = 0; break; } } /* check all supported impl if init() was already called */ if (err != 0 && gcm_impl_initialized) { /* check all supported implementations */ for (i = 0; i < gcm_supp_impl_cnt; i++) { if (strcmp(req_name, gcm_supp_impl[i]->name) == 0) { impl = i; err = 0; break; } } } #ifdef CAN_USE_GCM_ASM /* * Use the avx implementation if available and the requested one is * avx or fastest. */ if (gcm_avx_will_work() == B_TRUE && (impl == IMPL_AVX || impl == IMPL_FASTEST)) { gcm_set_avx(B_TRUE); } else { gcm_set_avx(B_FALSE); } #endif if (err == 0) { if (gcm_impl_initialized) atomic_swap_32(&icp_gcm_impl, impl); else atomic_swap_32(&user_sel_impl, impl); } return (err); } #if defined(_KERNEL) && defined(__linux__) static int icp_gcm_impl_set(const char *val, zfs_kernel_param_t *kp) { return (gcm_impl_set(val)); } static int icp_gcm_impl_get(char *buffer, zfs_kernel_param_t *kp) { int i, cnt = 0; char *fmt; const uint32_t impl = GCM_IMPL_READ(icp_gcm_impl); ASSERT(gcm_impl_initialized); /* list mandatory options */ for (i = 0; i < ARRAY_SIZE(gcm_impl_opts); i++) { #ifdef CAN_USE_GCM_ASM /* Ignore avx implementation if it won't work. */ if (gcm_impl_opts[i].sel == IMPL_AVX && !gcm_avx_will_work()) { continue; } #endif fmt = (impl == gcm_impl_opts[i].sel) ? "[%s] " : "%s "; cnt += sprintf(buffer + cnt, fmt, gcm_impl_opts[i].name); } /* list all supported implementations */ for (i = 0; i < gcm_supp_impl_cnt; i++) { fmt = (i == impl) ? "[%s] " : "%s "; cnt += sprintf(buffer + cnt, fmt, gcm_supp_impl[i]->name); } return (cnt); } module_param_call(icp_gcm_impl, icp_gcm_impl_set, icp_gcm_impl_get, NULL, 0644); MODULE_PARM_DESC(icp_gcm_impl, "Select gcm implementation."); #endif /* defined(__KERNEL) */ #ifdef CAN_USE_GCM_ASM #define GCM_BLOCK_LEN 16 /* * The openssl asm routines are 6x aggregated and need that many bytes * at minimum. */ #define GCM_AVX_MIN_DECRYPT_BYTES (GCM_BLOCK_LEN * 6) #define GCM_AVX_MIN_ENCRYPT_BYTES (GCM_BLOCK_LEN * 6 * 3) /* * Ensure the chunk size is reasonable since we are allocating a * GCM_AVX_MAX_CHUNK_SIZEd buffer and disabling preemption and interrupts. */ #define GCM_AVX_MAX_CHUNK_SIZE \ (((128*1024)/GCM_AVX_MIN_DECRYPT_BYTES) * GCM_AVX_MIN_DECRYPT_BYTES) /* Clear the FPU registers since they hold sensitive internal state. */ #define clear_fpu_regs() clear_fpu_regs_avx() #define GHASH_AVX(ctx, in, len) \ gcm_ghash_avx((ctx)->gcm_ghash, (const uint64_t *)(ctx)->gcm_Htable, \ in, len) #define gcm_incr_counter_block(ctx) gcm_incr_counter_block_by(ctx, 1) /* Get the chunk size module parameter. */ #define GCM_CHUNK_SIZE_READ *(volatile uint32_t *) &gcm_avx_chunk_size /* * Module parameter: number of bytes to process at once while owning the FPU. * Rounded down to the next GCM_AVX_MIN_DECRYPT_BYTES byte boundary and is * ensured to be greater or equal than GCM_AVX_MIN_DECRYPT_BYTES. */ static uint32_t gcm_avx_chunk_size = ((32 * 1024) / GCM_AVX_MIN_DECRYPT_BYTES) * GCM_AVX_MIN_DECRYPT_BYTES; extern void clear_fpu_regs_avx(void); extern void gcm_xor_avx(const uint8_t *src, uint8_t *dst); extern void aes_encrypt_intel(const uint32_t rk[], int nr, const uint32_t pt[4], uint32_t ct[4]); extern void gcm_init_htab_avx(uint64_t *Htable, const uint64_t H[2]); extern void gcm_ghash_avx(uint64_t ghash[2], const uint64_t *Htable, const uint8_t *in, size_t len); extern size_t aesni_gcm_encrypt(const uint8_t *, uint8_t *, size_t, const void *, uint64_t *, uint64_t *); extern size_t aesni_gcm_decrypt(const uint8_t *, uint8_t *, size_t, const void *, uint64_t *, uint64_t *); static inline boolean_t gcm_avx_will_work(void) { /* Avx should imply aes-ni and pclmulqdq, but make sure anyhow. */ return (kfpu_allowed() && zfs_avx_available() && zfs_aes_available() && zfs_pclmulqdq_available()); } static inline void gcm_set_avx(boolean_t val) { if (gcm_avx_will_work() == B_TRUE) { atomic_swap_32(&gcm_use_avx, val); } } static inline boolean_t gcm_toggle_avx(void) { if (gcm_avx_will_work() == B_TRUE) { return (atomic_toggle_boolean_nv(&GCM_IMPL_USE_AVX)); } else { return (B_FALSE); } } static inline size_t gcm_simd_get_htab_size(boolean_t simd_mode) { switch (simd_mode) { case B_TRUE: return (2 * 6 * 2 * sizeof (uint64_t)); default: return (0); } } /* * Clear sensitive data in the context. * * ctx->gcm_remainder may contain a plaintext remainder. ctx->gcm_H and * ctx->gcm_Htable contain the hash sub key which protects authentication. * * Although extremely unlikely, ctx->gcm_J0 and ctx->gcm_tmp could be used for * a known plaintext attack, they consists of the IV and the first and last * counter respectively. If they should be cleared is debatable. */ static inline void gcm_clear_ctx(gcm_ctx_t *ctx) { memset(ctx->gcm_remainder, 0, sizeof (ctx->gcm_remainder)); memset(ctx->gcm_H, 0, sizeof (ctx->gcm_H)); memset(ctx->gcm_J0, 0, sizeof (ctx->gcm_J0)); memset(ctx->gcm_tmp, 0, sizeof (ctx->gcm_tmp)); } /* Increment the GCM counter block by n. */ static inline void gcm_incr_counter_block_by(gcm_ctx_t *ctx, int n) { uint64_t counter_mask = ntohll(0x00000000ffffffffULL); uint64_t counter = ntohll(ctx->gcm_cb[1] & counter_mask); counter = htonll(counter + n); counter &= counter_mask; ctx->gcm_cb[1] = (ctx->gcm_cb[1] & ~counter_mask) | counter; } /* * Encrypt multiple blocks of data in GCM mode. * This is done in gcm_avx_chunk_size chunks, utilizing AVX assembler routines * if possible. While processing a chunk the FPU is "locked". */ static int gcm_mode_encrypt_contiguous_blocks_avx(gcm_ctx_t *ctx, char *data, size_t length, crypto_data_t *out, size_t block_size) { size_t bleft = length; size_t need = 0; size_t done = 0; uint8_t *datap = (uint8_t *)data; size_t chunk_size = (size_t)GCM_CHUNK_SIZE_READ; const aes_key_t *key = ((aes_key_t *)ctx->gcm_keysched); uint64_t *ghash = ctx->gcm_ghash; uint64_t *cb = ctx->gcm_cb; uint8_t *ct_buf = NULL; uint8_t *tmp = (uint8_t *)ctx->gcm_tmp; int rv = CRYPTO_SUCCESS; ASSERT(block_size == GCM_BLOCK_LEN); /* * If the last call left an incomplete block, try to fill * it first. */ if (ctx->gcm_remainder_len > 0) { need = block_size - ctx->gcm_remainder_len; if (length < need) { /* Accumulate bytes here and return. */ memcpy((uint8_t *)ctx->gcm_remainder + ctx->gcm_remainder_len, datap, length); ctx->gcm_remainder_len += length; if (ctx->gcm_copy_to == NULL) { ctx->gcm_copy_to = datap; } return (CRYPTO_SUCCESS); } else { /* Complete incomplete block. */ memcpy((uint8_t *)ctx->gcm_remainder + ctx->gcm_remainder_len, datap, need); ctx->gcm_copy_to = NULL; } } /* Allocate a buffer to encrypt to if there is enough input. */ if (bleft >= GCM_AVX_MIN_ENCRYPT_BYTES) { ct_buf = vmem_alloc(chunk_size, KM_SLEEP); if (ct_buf == NULL) { return (CRYPTO_HOST_MEMORY); } } /* If we completed an incomplete block, encrypt and write it out. */ if (ctx->gcm_remainder_len > 0) { kfpu_begin(); aes_encrypt_intel(key->encr_ks.ks32, key->nr, (const uint32_t *)cb, (uint32_t *)tmp); gcm_xor_avx((const uint8_t *) ctx->gcm_remainder, tmp); GHASH_AVX(ctx, tmp, block_size); clear_fpu_regs(); kfpu_end(); rv = crypto_put_output_data(tmp, out, block_size); out->cd_offset += block_size; gcm_incr_counter_block(ctx); ctx->gcm_processed_data_len += block_size; bleft -= need; datap += need; ctx->gcm_remainder_len = 0; } /* Do the bulk encryption in chunk_size blocks. */ for (; bleft >= chunk_size; bleft -= chunk_size) { kfpu_begin(); done = aesni_gcm_encrypt( datap, ct_buf, chunk_size, key, cb, ghash); clear_fpu_regs(); kfpu_end(); if (done != chunk_size) { rv = CRYPTO_FAILED; goto out_nofpu; } rv = crypto_put_output_data(ct_buf, out, chunk_size); if (rv != CRYPTO_SUCCESS) { goto out_nofpu; } out->cd_offset += chunk_size; datap += chunk_size; ctx->gcm_processed_data_len += chunk_size; } /* Check if we are already done. */ if (bleft == 0) { goto out_nofpu; } /* Bulk encrypt the remaining data. */ kfpu_begin(); if (bleft >= GCM_AVX_MIN_ENCRYPT_BYTES) { done = aesni_gcm_encrypt(datap, ct_buf, bleft, key, cb, ghash); if (done == 0) { rv = CRYPTO_FAILED; goto out; } rv = crypto_put_output_data(ct_buf, out, done); if (rv != CRYPTO_SUCCESS) { goto out; } out->cd_offset += done; ctx->gcm_processed_data_len += done; datap += done; bleft -= done; } /* Less than GCM_AVX_MIN_ENCRYPT_BYTES remain, operate on blocks. */ while (bleft > 0) { if (bleft < block_size) { memcpy(ctx->gcm_remainder, datap, bleft); ctx->gcm_remainder_len = bleft; ctx->gcm_copy_to = datap; goto out; } /* Encrypt, hash and write out. */ aes_encrypt_intel(key->encr_ks.ks32, key->nr, (const uint32_t *)cb, (uint32_t *)tmp); gcm_xor_avx(datap, tmp); GHASH_AVX(ctx, tmp, block_size); rv = crypto_put_output_data(tmp, out, block_size); if (rv != CRYPTO_SUCCESS) { goto out; } out->cd_offset += block_size; gcm_incr_counter_block(ctx); ctx->gcm_processed_data_len += block_size; datap += block_size; bleft -= block_size; } out: clear_fpu_regs(); kfpu_end(); out_nofpu: if (ct_buf != NULL) { vmem_free(ct_buf, chunk_size); } return (rv); } /* * Finalize the encryption: Zero fill, encrypt, hash and write out an eventual * incomplete last block. Encrypt the ICB. Calculate the tag and write it out. */ static int gcm_encrypt_final_avx(gcm_ctx_t *ctx, crypto_data_t *out, size_t block_size) { uint8_t *ghash = (uint8_t *)ctx->gcm_ghash; uint32_t *J0 = (uint32_t *)ctx->gcm_J0; uint8_t *remainder = (uint8_t *)ctx->gcm_remainder; size_t rem_len = ctx->gcm_remainder_len; const void *keysched = ((aes_key_t *)ctx->gcm_keysched)->encr_ks.ks32; int aes_rounds = ((aes_key_t *)keysched)->nr; int rv; ASSERT(block_size == GCM_BLOCK_LEN); if (out->cd_length < (rem_len + ctx->gcm_tag_len)) { return (CRYPTO_DATA_LEN_RANGE); } kfpu_begin(); /* Pad last incomplete block with zeros, encrypt and hash. */ if (rem_len > 0) { uint8_t *tmp = (uint8_t *)ctx->gcm_tmp; const uint32_t *cb = (uint32_t *)ctx->gcm_cb; aes_encrypt_intel(keysched, aes_rounds, cb, (uint32_t *)tmp); memset(remainder + rem_len, 0, block_size - rem_len); for (int i = 0; i < rem_len; i++) { remainder[i] ^= tmp[i]; } GHASH_AVX(ctx, remainder, block_size); ctx->gcm_processed_data_len += rem_len; /* No need to increment counter_block, it's the last block. */ } /* Finish tag. */ ctx->gcm_len_a_len_c[1] = htonll(CRYPTO_BYTES2BITS(ctx->gcm_processed_data_len)); GHASH_AVX(ctx, (const uint8_t *)ctx->gcm_len_a_len_c, block_size); aes_encrypt_intel(keysched, aes_rounds, J0, J0); gcm_xor_avx((uint8_t *)J0, ghash); clear_fpu_regs(); kfpu_end(); /* Output remainder. */ if (rem_len > 0) { rv = crypto_put_output_data(remainder, out, rem_len); if (rv != CRYPTO_SUCCESS) return (rv); } out->cd_offset += rem_len; ctx->gcm_remainder_len = 0; rv = crypto_put_output_data(ghash, out, ctx->gcm_tag_len); if (rv != CRYPTO_SUCCESS) return (rv); out->cd_offset += ctx->gcm_tag_len; /* Clear sensitive data in the context before returning. */ gcm_clear_ctx(ctx); return (CRYPTO_SUCCESS); } /* * Finalize decryption: We just have accumulated crypto text, so now we * decrypt it here inplace. */ static int gcm_decrypt_final_avx(gcm_ctx_t *ctx, crypto_data_t *out, size_t block_size) { ASSERT3U(ctx->gcm_processed_data_len, ==, ctx->gcm_pt_buf_len); ASSERT3U(block_size, ==, 16); size_t chunk_size = (size_t)GCM_CHUNK_SIZE_READ; size_t pt_len = ctx->gcm_processed_data_len - ctx->gcm_tag_len; uint8_t *datap = ctx->gcm_pt_buf; const aes_key_t *key = ((aes_key_t *)ctx->gcm_keysched); uint32_t *cb = (uint32_t *)ctx->gcm_cb; uint64_t *ghash = ctx->gcm_ghash; uint32_t *tmp = (uint32_t *)ctx->gcm_tmp; int rv = CRYPTO_SUCCESS; size_t bleft, done; /* * Decrypt in chunks of gcm_avx_chunk_size, which is asserted to be * greater or equal than GCM_AVX_MIN_ENCRYPT_BYTES, and a multiple of * GCM_AVX_MIN_DECRYPT_BYTES. */ for (bleft = pt_len; bleft >= chunk_size; bleft -= chunk_size) { kfpu_begin(); done = aesni_gcm_decrypt(datap, datap, chunk_size, (const void *)key, ctx->gcm_cb, ghash); clear_fpu_regs(); kfpu_end(); if (done != chunk_size) { return (CRYPTO_FAILED); } datap += done; } /* Decrypt remainder, which is less than chunk size, in one go. */ kfpu_begin(); if (bleft >= GCM_AVX_MIN_DECRYPT_BYTES) { done = aesni_gcm_decrypt(datap, datap, bleft, (const void *)key, ctx->gcm_cb, ghash); if (done == 0) { clear_fpu_regs(); kfpu_end(); return (CRYPTO_FAILED); } datap += done; bleft -= done; } ASSERT(bleft < GCM_AVX_MIN_DECRYPT_BYTES); /* * Now less than GCM_AVX_MIN_DECRYPT_BYTES bytes remain, * decrypt them block by block. */ while (bleft > 0) { /* Incomplete last block. */ if (bleft < block_size) { uint8_t *lastb = (uint8_t *)ctx->gcm_remainder; memset(lastb, 0, block_size); memcpy(lastb, datap, bleft); /* The GCM processing. */ GHASH_AVX(ctx, lastb, block_size); aes_encrypt_intel(key->encr_ks.ks32, key->nr, cb, tmp); for (size_t i = 0; i < bleft; i++) { datap[i] = lastb[i] ^ ((uint8_t *)tmp)[i]; } break; } /* The GCM processing. */ GHASH_AVX(ctx, datap, block_size); aes_encrypt_intel(key->encr_ks.ks32, key->nr, cb, tmp); gcm_xor_avx((uint8_t *)tmp, datap); gcm_incr_counter_block(ctx); datap += block_size; bleft -= block_size; } if (rv != CRYPTO_SUCCESS) { clear_fpu_regs(); kfpu_end(); return (rv); } /* Decryption done, finish the tag. */ ctx->gcm_len_a_len_c[1] = htonll(CRYPTO_BYTES2BITS(pt_len)); GHASH_AVX(ctx, (uint8_t *)ctx->gcm_len_a_len_c, block_size); aes_encrypt_intel(key->encr_ks.ks32, key->nr, (uint32_t *)ctx->gcm_J0, (uint32_t *)ctx->gcm_J0); gcm_xor_avx((uint8_t *)ctx->gcm_J0, (uint8_t *)ghash); /* We are done with the FPU, restore its state. */ clear_fpu_regs(); kfpu_end(); /* Compare the input authentication tag with what we calculated. */ if (memcmp(&ctx->gcm_pt_buf[pt_len], ghash, ctx->gcm_tag_len)) { /* They don't match. */ return (CRYPTO_INVALID_MAC); } rv = crypto_put_output_data(ctx->gcm_pt_buf, out, pt_len); if (rv != CRYPTO_SUCCESS) { return (rv); } out->cd_offset += pt_len; gcm_clear_ctx(ctx); return (CRYPTO_SUCCESS); } /* * Initialize the GCM params H, Htabtle and the counter block. Save the * initial counter block. */ static int gcm_init_avx(gcm_ctx_t *ctx, unsigned char *iv, size_t iv_len, unsigned char *auth_data, size_t auth_data_len, size_t block_size) { uint8_t *cb = (uint8_t *)ctx->gcm_cb; uint64_t *H = ctx->gcm_H; const void *keysched = ((aes_key_t *)ctx->gcm_keysched)->encr_ks.ks32; int aes_rounds = ((aes_key_t *)ctx->gcm_keysched)->nr; uint8_t *datap = auth_data; size_t chunk_size = (size_t)GCM_CHUNK_SIZE_READ; size_t bleft; ASSERT(block_size == GCM_BLOCK_LEN); /* Init H (encrypt zero block) and create the initial counter block. */ memset(ctx->gcm_ghash, 0, sizeof (ctx->gcm_ghash)); memset(H, 0, sizeof (ctx->gcm_H)); kfpu_begin(); aes_encrypt_intel(keysched, aes_rounds, (const uint32_t *)H, (uint32_t *)H); gcm_init_htab_avx(ctx->gcm_Htable, H); if (iv_len == 12) { memcpy(cb, iv, 12); cb[12] = 0; cb[13] = 0; cb[14] = 0; cb[15] = 1; /* We need the ICB later. */ memcpy(ctx->gcm_J0, cb, sizeof (ctx->gcm_J0)); } else { /* * Most consumers use 12 byte IVs, so it's OK to use the * original routines for other IV sizes, just avoid nesting * kfpu_begin calls. */ clear_fpu_regs(); kfpu_end(); gcm_format_initial_blocks(iv, iv_len, ctx, block_size, aes_copy_block, aes_xor_block); kfpu_begin(); } /* Openssl post increments the counter, adjust for that. */ gcm_incr_counter_block(ctx); /* Ghash AAD in chunk_size blocks. */ for (bleft = auth_data_len; bleft >= chunk_size; bleft -= chunk_size) { GHASH_AVX(ctx, datap, chunk_size); datap += chunk_size; clear_fpu_regs(); kfpu_end(); kfpu_begin(); } /* Ghash the remainder and handle possible incomplete GCM block. */ if (bleft > 0) { size_t incomp = bleft % block_size; bleft -= incomp; if (bleft > 0) { GHASH_AVX(ctx, datap, bleft); datap += bleft; } if (incomp > 0) { /* Zero pad and hash incomplete last block. */ uint8_t *authp = (uint8_t *)ctx->gcm_tmp; memset(authp, 0, block_size); memcpy(authp, datap, incomp); GHASH_AVX(ctx, authp, block_size); } } clear_fpu_regs(); kfpu_end(); return (CRYPTO_SUCCESS); } #if defined(_KERNEL) static int icp_gcm_avx_set_chunk_size(const char *buf, zfs_kernel_param_t *kp) { unsigned long val; char val_rounded[16]; int error = 0; error = kstrtoul(buf, 0, &val); if (error) return (error); val = (val / GCM_AVX_MIN_DECRYPT_BYTES) * GCM_AVX_MIN_DECRYPT_BYTES; if (val < GCM_AVX_MIN_ENCRYPT_BYTES || val > GCM_AVX_MAX_CHUNK_SIZE) return (-EINVAL); snprintf(val_rounded, 16, "%u", (uint32_t)val); error = param_set_uint(val_rounded, kp); return (error); } module_param_call(icp_gcm_avx_chunk_size, icp_gcm_avx_set_chunk_size, param_get_uint, &gcm_avx_chunk_size, 0644); MODULE_PARM_DESC(icp_gcm_avx_chunk_size, "How many bytes to process while owning the FPU"); #endif /* defined(__KERNEL) */ #endif /* ifdef CAN_USE_GCM_ASM */ diff --git a/module/lua/ldo.c b/module/lua/ldo.c index 24677596de12..6bef80514ce2 100644 --- a/module/lua/ldo.c +++ b/module/lua/ldo.c @@ -1,758 +1,758 @@ /* ** $Id: ldo.c,v 2.108.1.3 2013/11/08 18:22:50 roberto Exp $ ** Stack and Call structure of Lua ** See Copyright Notice in lua.h */ #define ldo_c #define LUA_CORE #include #include "lapi.h" #include "ldebug.h" #include "ldo.h" #include "lfunc.h" #include "lgc.h" #include "lmem.h" #include "lobject.h" #include "lopcodes.h" #include "lparser.h" #include "lstate.h" #include "lstring.h" #include "ltable.h" #include "ltm.h" #include "lvm.h" #include "lzio.h" /* Return the number of bytes available on the stack. */ #if defined (_KERNEL) && defined(__linux__) #include static intptr_t stack_remaining(void) { intptr_t local; local = (intptr_t)&local - (intptr_t)current->stack; return local; } #elif defined (_KERNEL) && defined(__FreeBSD__) #include static intptr_t stack_remaining(void) { intptr_t local; local = (intptr_t)&local - (intptr_t)curthread->td_kstack; return local; } #else static intptr_t stack_remaining(void) { return INTPTR_MAX; } #endif /* ** {====================================================== ** Error-recovery functions ** ======================================================= */ /* ** LUAI_THROW/LUAI_TRY define how Lua does exception handling. By ** default, Lua handles errors with exceptions when compiling as ** C++ code, with _longjmp/_setjmp when asked to use them, and with ** longjmp/setjmp otherwise. */ #if !defined(LUAI_THROW) #ifdef _KERNEL #ifdef __linux__ #if defined(__i386__) #define JMP_BUF_CNT 6 #elif defined(__x86_64__) #define JMP_BUF_CNT 8 #elif defined(__sparc__) && defined(__arch64__) #define JMP_BUF_CNT 6 #elif defined(__powerpc__) #define JMP_BUF_CNT 26 #elif defined(__aarch64__) #define JMP_BUF_CNT 64 #elif defined(__arm__) #define JMP_BUF_CNT 65 #elif defined(__mips__) #define JMP_BUF_CNT 12 #elif defined(__s390x__) #define JMP_BUF_CNT 18 #elif defined(__riscv) #define JMP_BUF_CNT 64 #else #define JMP_BUF_CNT 1 #endif typedef struct _label_t { long long unsigned val[JMP_BUF_CNT]; } label_t; int setjmp(label_t *) __attribute__ ((__nothrow__)); extern __attribute__((noreturn)) void longjmp(label_t *); #define LUAI_THROW(L,c) longjmp(&(c)->b) #define LUAI_TRY(L,c,a) if (setjmp(&(c)->b) == 0) { a } #define luai_jmpbuf label_t /* unsupported arches will build but not be able to run lua programs */ #if JMP_BUF_CNT == 1 int setjmp (label_t *buf) { return 1; } void longjmp (label_t * buf) { for (;;); } #endif #else #define LUAI_THROW(L,c) longjmp((c)->b, 1) #define LUAI_TRY(L,c,a) if (setjmp((c)->b) == 0) { a } #define luai_jmpbuf jmp_buf #endif #else /* _KERNEL */ #if defined(__cplusplus) && !defined(LUA_USE_LONGJMP) /* C++ exceptions */ #define LUAI_THROW(L,c) throw(c) #define LUAI_TRY(L,c,a) \ try { a } catch(...) { if ((c)->status == 0) (c)->status = -1; } #define luai_jmpbuf int /* dummy variable */ #elif defined(LUA_USE_ULONGJMP) /* in Unix, try _longjmp/_setjmp (more efficient) */ #define LUAI_THROW(L,c) _longjmp((c)->b, 1) #define LUAI_TRY(L,c,a) if (_setjmp((c)->b) == 0) { a } #define luai_jmpbuf jmp_buf #else /* default handling with long jumps */ #define LUAI_THROW(L,c) longjmp((c)->b, 1) #define LUAI_TRY(L,c,a) if (setjmp((c)->b) == 0) { a } #define luai_jmpbuf jmp_buf #endif #endif /* _KERNEL */ #endif /* LUAI_THROW */ /* chain list of long jump buffers */ struct lua_longjmp { struct lua_longjmp *previous; luai_jmpbuf b; volatile int status; /* error code */ }; static void seterrorobj (lua_State *L, int errcode, StkId oldtop) { switch (errcode) { case LUA_ERRMEM: { /* memory error? */ setsvalue2s(L, oldtop, G(L)->memerrmsg); /* reuse preregistered msg. */ break; } case LUA_ERRERR: { setsvalue2s(L, oldtop, luaS_newliteral(L, "error in error handling")); break; } default: { setobjs2s(L, oldtop, L->top - 1); /* error message on current top */ break; } } L->top = oldtop + 1; } /* * Silence infinite recursion warning which was added to -Wall in gcc 12.1 */ #if defined(HAVE_INFINITE_RECURSION) #pragma GCC diagnostic push #pragma GCC diagnostic ignored "-Winfinite-recursion" #endif l_noret luaD_throw (lua_State *L, int errcode) { if (L->errorJmp) { /* thread has an error handler? */ L->errorJmp->status = errcode; /* set status */ LUAI_THROW(L, L->errorJmp); /* jump to it */ } else { /* thread has no error handler */ L->status = cast_byte(errcode); /* mark it as dead */ if (G(L)->mainthread->errorJmp) { /* main thread has a handler? */ setobjs2s(L, G(L)->mainthread->top++, L->top - 1); /* copy error obj. */ luaD_throw(G(L)->mainthread, errcode); /* re-throw in main thread */ } else { /* no handler at all; abort */ if (G(L)->panic) { /* panic function? */ lua_unlock(L); G(L)->panic(L); /* call it (last chance to jump out) */ } panic("no error handler"); } } } #if defined(HAVE_INFINITE_RECURSION) #pragma GCC diagnostic pop #endif int luaD_rawrunprotected (lua_State *L, Pfunc f, void *ud) { unsigned short oldnCcalls = L->nCcalls; struct lua_longjmp lj; lj.status = LUA_OK; lj.previous = L->errorJmp; /* chain new error handler */ L->errorJmp = &lj; LUAI_TRY(L, &lj, (*f)(L, ud); ); L->errorJmp = lj.previous; /* restore old error handler */ L->nCcalls = oldnCcalls; return lj.status; } /* }====================================================== */ static void correctstack (lua_State *L, TValue *oldstack) { CallInfo *ci; GCObject *up; L->top = (L->top - oldstack) + L->stack; for (up = L->openupval; up != NULL; up = up->gch.next) gco2uv(up)->v = (gco2uv(up)->v - oldstack) + L->stack; for (ci = L->ci; ci != NULL; ci = ci->previous) { ci->top = (ci->top - oldstack) + L->stack; ci->func = (ci->func - oldstack) + L->stack; if (isLua(ci)) ci->u.l.base = (ci->u.l.base - oldstack) + L->stack; } } /* some space for error handling */ #define ERRORSTACKSIZE (LUAI_MAXSTACK + 200) void luaD_reallocstack (lua_State *L, int newsize) { TValue *oldstack = L->stack; int lim = L->stacksize; lua_assert(newsize <= LUAI_MAXSTACK || newsize == ERRORSTACKSIZE); lua_assert(L->stack_last - L->stack == L->stacksize - EXTRA_STACK); luaM_reallocvector(L, L->stack, L->stacksize, newsize, TValue); for (; lim < newsize; lim++) setnilvalue(L->stack + lim); /* erase new segment */ L->stacksize = newsize; L->stack_last = L->stack + newsize - EXTRA_STACK; correctstack(L, oldstack); } void luaD_growstack (lua_State *L, int n) { int size = L->stacksize; if (size > LUAI_MAXSTACK) /* error after extra size? */ luaD_throw(L, LUA_ERRERR); else { int needed = cast_int(L->top - L->stack) + n + EXTRA_STACK; int newsize = 2 * size; if (newsize > LUAI_MAXSTACK) newsize = LUAI_MAXSTACK; if (newsize < needed) newsize = needed; if (newsize > LUAI_MAXSTACK) { /* stack overflow? */ luaD_reallocstack(L, ERRORSTACKSIZE); luaG_runerror(L, "stack overflow"); } else luaD_reallocstack(L, newsize); } } static int stackinuse (lua_State *L) { CallInfo *ci; StkId lim = L->top; for (ci = L->ci; ci != NULL; ci = ci->previous) { lua_assert(ci->top <= L->stack_last); if (lim < ci->top) lim = ci->top; } return cast_int(lim - L->stack) + 1; /* part of stack in use */ } void luaD_shrinkstack (lua_State *L) { int inuse = stackinuse(L); int goodsize = inuse + (inuse / 8) + 2*EXTRA_STACK; if (goodsize > LUAI_MAXSTACK) goodsize = LUAI_MAXSTACK; if (inuse > LUAI_MAXSTACK || /* handling stack overflow? */ goodsize >= L->stacksize) /* would grow instead of shrink? */ condmovestack(L); /* don't change stack (change only for debugging) */ else luaD_reallocstack(L, goodsize); /* shrink it */ } void luaD_hook (lua_State *L, int event, int line) { lua_Hook hook = L->hook; if (hook && L->allowhook) { CallInfo *ci = L->ci; ptrdiff_t top = savestack(L, L->top); ptrdiff_t ci_top = savestack(L, ci->top); lua_Debug ar; ar.event = event; ar.currentline = line; ar.i_ci = ci; luaD_checkstack(L, LUA_MINSTACK); /* ensure minimum stack size */ ci->top = L->top + LUA_MINSTACK; lua_assert(ci->top <= L->stack_last); L->allowhook = 0; /* cannot call hooks inside a hook */ ci->callstatus |= CIST_HOOKED; lua_unlock(L); (*hook)(L, &ar); lua_lock(L); lua_assert(!L->allowhook); L->allowhook = 1; ci->top = restorestack(L, ci_top); L->top = restorestack(L, top); ci->callstatus &= ~CIST_HOOKED; } } static void callhook (lua_State *L, CallInfo *ci) { int hook = LUA_HOOKCALL; ci->u.l.savedpc++; /* hooks assume 'pc' is already incremented */ if (isLua(ci->previous) && GET_OPCODE(*(ci->previous->u.l.savedpc - 1)) == OP_TAILCALL) { ci->callstatus |= CIST_TAIL; hook = LUA_HOOKTAILCALL; } luaD_hook(L, hook, -1); ci->u.l.savedpc--; /* correct 'pc' */ } static StkId adjust_varargs (lua_State *L, Proto *p, int actual) { int i; int nfixargs = p->numparams; StkId base, fixed; lua_assert(actual >= nfixargs); /* move fixed parameters to final position */ luaD_checkstack(L, p->maxstacksize); /* check again for new 'base' */ fixed = L->top - actual; /* first fixed argument */ base = L->top; /* final position of first argument */ for (i=0; itop++, fixed + i); setnilvalue(fixed + i); } return base; } static StkId tryfuncTM (lua_State *L, StkId func) { const TValue *tm = luaT_gettmbyobj(L, func, TM_CALL); StkId p; ptrdiff_t funcr = savestack(L, func); if (!ttisfunction(tm)) luaG_typeerror(L, func, "call"); /* Open a hole inside the stack at `func' */ for (p = L->top; p > func; p--) setobjs2s(L, p, p-1); incr_top(L); func = restorestack(L, funcr); /* previous call may change stack */ setobj2s(L, func, tm); /* tag method is the new function to be called */ return func; } #define next_ci(L) (L->ci = (L->ci->next ? L->ci->next : luaE_extendCI(L))) /* ** returns true if function has been executed (C function) */ int luaD_precall (lua_State *L, StkId func, int nresults) { lua_CFunction f; CallInfo *ci; int n; /* number of arguments (Lua) or returns (C) */ ptrdiff_t funcr = savestack(L, func); switch (ttype(func)) { case LUA_TLCF: /* light C function */ f = fvalue(func); goto Cfunc; case LUA_TCCL: { /* C closure */ f = clCvalue(func)->f; Cfunc: luaD_checkstack(L, LUA_MINSTACK); /* ensure minimum stack size */ ci = next_ci(L); /* now 'enter' new function */ ci->nresults = nresults; ci->func = restorestack(L, funcr); ci->top = L->top + LUA_MINSTACK; lua_assert(ci->top <= L->stack_last); ci->callstatus = 0; luaC_checkGC(L); /* stack grow uses memory */ if (L->hookmask & LUA_MASKCALL) luaD_hook(L, LUA_HOOKCALL, -1); lua_unlock(L); n = (*f)(L); /* do the actual call */ lua_lock(L); api_checknelems(L, n); luaD_poscall(L, L->top - n); return 1; } case LUA_TLCL: { /* Lua function: prepare its call */ StkId base; Proto *p = clLvalue(func)->p; n = cast_int(L->top - func) - 1; /* number of real arguments */ luaD_checkstack(L, p->maxstacksize + p->numparams); for (; n < p->numparams; n++) setnilvalue(L->top++); /* complete missing arguments */ if (!p->is_vararg) { func = restorestack(L, funcr); base = func + 1; } else { base = adjust_varargs(L, p, n); func = restorestack(L, funcr); /* previous call can change stack */ } ci = next_ci(L); /* now 'enter' new function */ ci->nresults = nresults; ci->func = func; ci->u.l.base = base; ci->top = base + p->maxstacksize; lua_assert(ci->top <= L->stack_last); ci->u.l.savedpc = p->code; /* starting point */ ci->callstatus = CIST_LUA; L->top = ci->top; luaC_checkGC(L); /* stack grow uses memory */ if (L->hookmask & LUA_MASKCALL) callhook(L, ci); return 0; } default: { /* not a function */ func = tryfuncTM(L, func); /* retry with 'function' tag method */ return luaD_precall(L, func, nresults); /* now it must be a function */ } } } int luaD_poscall (lua_State *L, StkId firstResult) { StkId res; int wanted, i; CallInfo *ci = L->ci; if (L->hookmask & (LUA_MASKRET | LUA_MASKLINE)) { if (L->hookmask & LUA_MASKRET) { ptrdiff_t fr = savestack(L, firstResult); /* hook may change stack */ luaD_hook(L, LUA_HOOKRET, -1); firstResult = restorestack(L, fr); } L->oldpc = ci->previous->u.l.savedpc; /* 'oldpc' for caller function */ } res = ci->func; /* res == final position of 1st result */ wanted = ci->nresults; - L->ci = ci = ci->previous; /* back to caller */ + L->ci = ci->previous; /* back to caller */ /* move results to correct place */ for (i = wanted; i != 0 && firstResult < L->top; i--) setobjs2s(L, res++, firstResult++); while (i-- > 0) setnilvalue(res++); L->top = res; return (wanted - LUA_MULTRET); /* 0 iff wanted == LUA_MULTRET */ } /* ** Call a function (C or Lua). The function to be called is at *func. ** The arguments are on the stack, right after the function. ** When returns, all the results are on the stack, starting at the original ** function position. */ void luaD_call (lua_State *L, StkId func, int nResults, int allowyield) { if (++L->nCcalls >= LUAI_MAXCCALLS) { if (L->nCcalls == LUAI_MAXCCALLS) luaG_runerror(L, "C stack overflow"); else if (L->nCcalls >= (LUAI_MAXCCALLS + (LUAI_MAXCCALLS>>3))) luaD_throw(L, LUA_ERRERR); /* error while handling stack error */ } intptr_t remaining = stack_remaining(); if (L->runerror == 0 && remaining < LUAI_MINCSTACK) luaG_runerror(L, "C stack overflow"); if (L->runerror != 0 && remaining < LUAI_MINCSTACK / 2) luaD_throw(L, LUA_ERRERR); /* error while handling stack error */ if (!allowyield) L->nny++; if (!luaD_precall(L, func, nResults)) /* is a Lua function? */ luaV_execute(L); /* call it */ if (!allowyield) L->nny--; L->nCcalls--; } static void finishCcall (lua_State *L) { CallInfo *ci = L->ci; int n; lua_assert(ci->u.c.k != NULL); /* must have a continuation */ lua_assert(L->nny == 0); if (ci->callstatus & CIST_YPCALL) { /* was inside a pcall? */ ci->callstatus &= ~CIST_YPCALL; /* finish 'lua_pcall' */ L->errfunc = ci->u.c.old_errfunc; } /* finish 'lua_callk'/'lua_pcall' */ adjustresults(L, ci->nresults); /* call continuation function */ if (!(ci->callstatus & CIST_STAT)) /* no call status? */ ci->u.c.status = LUA_YIELD; /* 'default' status */ lua_assert(ci->u.c.status != LUA_OK); ci->callstatus = (ci->callstatus & ~(CIST_YPCALL | CIST_STAT)) | CIST_YIELDED; lua_unlock(L); n = (*ci->u.c.k)(L); lua_lock(L); api_checknelems(L, n); /* finish 'luaD_precall' */ luaD_poscall(L, L->top - n); } static void unroll (lua_State *L, void *ud) { UNUSED(ud); for (;;) { if (L->ci == &L->base_ci) /* stack is empty? */ return; /* coroutine finished normally */ if (!isLua(L->ci)) /* C function? */ finishCcall(L); else { /* Lua function */ luaV_finishOp(L); /* finish interrupted instruction */ luaV_execute(L); /* execute down to higher C 'boundary' */ } } } /* ** check whether thread has a suspended protected call */ static CallInfo *findpcall (lua_State *L) { CallInfo *ci; for (ci = L->ci; ci != NULL; ci = ci->previous) { /* search for a pcall */ if (ci->callstatus & CIST_YPCALL) return ci; } return NULL; /* no pending pcall */ } static int recover (lua_State *L, int status) { StkId oldtop; CallInfo *ci = findpcall(L); if (ci == NULL) return 0; /* no recovery point */ /* "finish" luaD_pcall */ oldtop = restorestack(L, ci->extra); luaF_close(L, oldtop); seterrorobj(L, status, oldtop); L->ci = ci; L->allowhook = ci->u.c.old_allowhook; L->nny = 0; /* should be zero to be yieldable */ luaD_shrinkstack(L); L->errfunc = ci->u.c.old_errfunc; ci->callstatus |= CIST_STAT; /* call has error status */ ci->u.c.status = status; /* (here it is) */ return 1; /* continue running the coroutine */ } /* ** signal an error in the call to 'resume', not in the execution of the ** coroutine itself. (Such errors should not be handled by any coroutine ** error handler and should not kill the coroutine.) */ static l_noret resume_error (lua_State *L, const char *msg, StkId firstArg) { L->top = firstArg; /* remove args from the stack */ setsvalue2s(L, L->top, luaS_new(L, msg)); /* push error message */ api_incr_top(L); luaD_throw(L, -1); /* jump back to 'lua_resume' */ } /* ** do the work for 'lua_resume' in protected mode */ static void resume_cb (lua_State *L, void *ud) { int nCcalls = L->nCcalls; StkId firstArg = cast(StkId, ud); CallInfo *ci = L->ci; if (nCcalls >= LUAI_MAXCCALLS) resume_error(L, "C stack overflow", firstArg); if (L->status == LUA_OK) { /* may be starting a coroutine */ if (ci != &L->base_ci) /* not in base level? */ resume_error(L, "cannot resume non-suspended coroutine", firstArg); /* coroutine is in base level; start running it */ if (!luaD_precall(L, firstArg - 1, LUA_MULTRET)) /* Lua function? */ luaV_execute(L); /* call it */ } else if (L->status != LUA_YIELD) resume_error(L, "cannot resume dead coroutine", firstArg); else { /* resuming from previous yield */ L->status = LUA_OK; ci->func = restorestack(L, ci->extra); if (isLua(ci)) /* yielded inside a hook? */ luaV_execute(L); /* just continue running Lua code */ else { /* 'common' yield */ if (ci->u.c.k != NULL) { /* does it have a continuation? */ int n; ci->u.c.status = LUA_YIELD; /* 'default' status */ ci->callstatus |= CIST_YIELDED; lua_unlock(L); n = (*ci->u.c.k)(L); /* call continuation */ lua_lock(L); api_checknelems(L, n); firstArg = L->top - n; /* yield results come from continuation */ } luaD_poscall(L, firstArg); /* finish 'luaD_precall' */ } unroll(L, NULL); } lua_assert(nCcalls == L->nCcalls); } LUA_API int lua_resume (lua_State *L, lua_State *from, int nargs) { int status; int oldnny = L->nny; /* save 'nny' */ lua_lock(L); luai_userstateresume(L, nargs); L->nCcalls = (from) ? from->nCcalls + 1 : 1; L->nny = 0; /* allow yields */ api_checknelems(L, (L->status == LUA_OK) ? nargs + 1 : nargs); status = luaD_rawrunprotected(L, resume_cb, L->top - nargs); if (status == -1) /* error calling 'lua_resume'? */ status = LUA_ERRRUN; else { /* yield or regular error */ while (status != LUA_OK && status != LUA_YIELD) { /* error? */ if (recover(L, status)) /* recover point? */ status = luaD_rawrunprotected(L, unroll, NULL); /* run continuation */ else { /* unrecoverable error */ L->status = cast_byte(status); /* mark thread as `dead' */ seterrorobj(L, status, L->top); L->ci->top = L->top; break; } } lua_assert(status == L->status); } L->nny = oldnny; /* restore 'nny' */ L->nCcalls--; lua_assert(L->nCcalls == ((from) ? from->nCcalls : 0)); lua_unlock(L); return status; } LUA_API int lua_yieldk (lua_State *L, int nresults, int ctx, lua_CFunction k) { CallInfo *ci = L->ci; luai_userstateyield(L, nresults); lua_lock(L); api_checknelems(L, nresults); if (L->nny > 0) { if (L != G(L)->mainthread) luaG_runerror(L, "attempt to yield across a C-call boundary"); else luaG_runerror(L, "attempt to yield from outside a coroutine"); } L->status = LUA_YIELD; ci->extra = savestack(L, ci->func); /* save current 'func' */ if (isLua(ci)) { /* inside a hook? */ api_check(L, k == NULL, "hooks cannot continue after yielding"); } else { if ((ci->u.c.k = k) != NULL) /* is there a continuation? */ ci->u.c.ctx = ctx; /* save context */ ci->func = L->top - nresults - 1; /* protect stack below results */ luaD_throw(L, LUA_YIELD); } lua_assert(ci->callstatus & CIST_HOOKED); /* must be inside a hook */ lua_unlock(L); return 0; /* return to 'luaD_hook' */ } int luaD_pcall (lua_State *L, Pfunc func, void *u, ptrdiff_t old_top, ptrdiff_t ef) { int status; CallInfo *old_ci = L->ci; lu_byte old_allowhooks = L->allowhook; unsigned short old_nny = L->nny; ptrdiff_t old_errfunc = L->errfunc; L->errfunc = ef; status = luaD_rawrunprotected(L, func, u); if (status != LUA_OK) { /* an error occurred? */ StkId oldtop = restorestack(L, old_top); luaF_close(L, oldtop); /* close possible pending closures */ seterrorobj(L, status, oldtop); L->ci = old_ci; L->allowhook = old_allowhooks; L->nny = old_nny; luaD_shrinkstack(L); } L->errfunc = old_errfunc; return status; } /* ** Execute a protected parser. */ struct SParser { /* data to `f_parser' */ ZIO *z; Mbuffer buff; /* dynamic structure used by the scanner */ Dyndata dyd; /* dynamic structures used by the parser */ const char *mode; const char *name; }; static void checkmode (lua_State *L, const char *mode, const char *x) { if (mode && strchr(mode, x[0]) == NULL) { luaO_pushfstring(L, "attempt to load a %s chunk (mode is " LUA_QS ")", x, mode); luaD_throw(L, LUA_ERRSYNTAX); } } static void f_parser (lua_State *L, void *ud) { int i; Closure *cl; struct SParser *p = cast(struct SParser *, ud); int c = zgetc(p->z); /* read first character */ lua_assert(c != LUA_SIGNATURE[0]); /* binary not supported */ checkmode(L, p->mode, "text"); cl = luaY_parser(L, p->z, &p->buff, &p->dyd, p->name, c); lua_assert(cl->l.nupvalues == cl->l.p->sizeupvalues); for (i = 0; i < cl->l.nupvalues; i++) { /* initialize upvalues */ UpVal *up = luaF_newupval(L); cl->l.upvals[i] = up; luaC_objbarrier(L, cl, up); } } int luaD_protectedparser (lua_State *L, ZIO *z, const char *name, const char *mode) { struct SParser p; int status; L->nny++; /* cannot yield during parsing */ p.z = z; p.name = name; p.mode = mode; p.dyd.actvar.arr = NULL; p.dyd.actvar.size = 0; p.dyd.gt.arr = NULL; p.dyd.gt.size = 0; p.dyd.label.arr = NULL; p.dyd.label.size = 0; luaZ_initbuffer(L, &p.buff); status = luaD_pcall(L, f_parser, &p, savestack(L, L->top), L->errfunc); luaZ_freebuffer(L, &p.buff); luaM_freearray(L, p.dyd.actvar.arr, p.dyd.actvar.size); luaM_freearray(L, p.dyd.gt.arr, p.dyd.gt.size); luaM_freearray(L, p.dyd.label.arr, p.dyd.label.size); L->nny--; return status; } diff --git a/module/os/freebsd/zfs/zfs_znode.c b/module/os/freebsd/zfs/zfs_znode.c index 6345e9e69d30..192aa748fc13 100644 --- a/module/os/freebsd/zfs/zfs_znode.c +++ b/module/os/freebsd/zfs/zfs_znode.c @@ -1,2112 +1,2111 @@ /* * CDDL HEADER START * * The contents of this file are subject to the terms of the * Common Development and Distribution License (the "License"). * You may not use this file except in compliance with the License. * * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE * or https://opensource.org/licenses/CDDL-1.0. * See the License for the specific language governing permissions * and limitations under the License. * * When distributing Covered Code, include this CDDL HEADER in each * file and include the License file at usr/src/OPENSOLARIS.LICENSE. * If applicable, add the following below this CDDL HEADER, with the * fields enclosed by brackets "[]" replaced with your own identifying * information: Portions Copyright [yyyy] [name of copyright owner] * * CDDL HEADER END */ /* * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. * Copyright (c) 2012, 2014 by Delphix. All rights reserved. * Copyright (c) 2014 Integros [integros.com] */ /* Portions Copyright 2007 Jeremy Teo */ /* Portions Copyright 2011 Martin Matuska */ #ifdef _KERNEL #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #endif /* _KERNEL */ #include #include #include #include #include #include #include #include #include #include #include "zfs_prop.h" #include "zfs_comutil.h" /* Used by fstat(1). */ SYSCTL_INT(_debug_sizeof, OID_AUTO, znode, CTLFLAG_RD, SYSCTL_NULL_INT_PTR, sizeof (znode_t), "sizeof(znode_t)"); /* * Define ZNODE_STATS to turn on statistic gathering. By default, it is only * turned on when DEBUG is also defined. */ #ifdef ZFS_DEBUG #define ZNODE_STATS #endif /* DEBUG */ #ifdef ZNODE_STATS #define ZNODE_STAT_ADD(stat) ((stat)++) #else #define ZNODE_STAT_ADD(stat) /* nothing */ #endif /* ZNODE_STATS */ /* * Functions needed for userland (ie: libzpool) are not put under * #ifdef_KERNEL; the rest of the functions have dependencies * (such as VFS logic) that will not compile easily in userland. */ #ifdef _KERNEL #if !defined(KMEM_DEBUG) && __FreeBSD_version >= 1300102 #define _ZFS_USE_SMR static uma_zone_t znode_uma_zone; #else static kmem_cache_t *znode_cache = NULL; #endif extern struct vop_vector zfs_vnodeops; extern struct vop_vector zfs_fifoops; extern struct vop_vector zfs_shareops; /* * This callback is invoked when acquiring a RL_WRITER or RL_APPEND lock on * z_rangelock. It will modify the offset and length of the lock to reflect * znode-specific information, and convert RL_APPEND to RL_WRITER. This is * called with the rangelock_t's rl_lock held, which avoids races. */ static void zfs_rangelock_cb(zfs_locked_range_t *new, void *arg) { znode_t *zp = arg; /* * If in append mode, convert to writer and lock starting at the * current end of file. */ if (new->lr_type == RL_APPEND) { new->lr_offset = zp->z_size; new->lr_type = RL_WRITER; } /* * If we need to grow the block size then lock the whole file range. */ uint64_t end_size = MAX(zp->z_size, new->lr_offset + new->lr_length); if (end_size > zp->z_blksz && (!ISP2(zp->z_blksz) || zp->z_blksz < ZTOZSB(zp)->z_max_blksz)) { new->lr_offset = 0; new->lr_length = UINT64_MAX; } } static int zfs_znode_cache_constructor(void *buf, void *arg, int kmflags) { znode_t *zp = buf; POINTER_INVALIDATE(&zp->z_zfsvfs); list_link_init(&zp->z_link_node); mutex_init(&zp->z_lock, NULL, MUTEX_DEFAULT, NULL); mutex_init(&zp->z_acl_lock, NULL, MUTEX_DEFAULT, NULL); rw_init(&zp->z_xattr_lock, NULL, RW_DEFAULT, NULL); zfs_rangelock_init(&zp->z_rangelock, zfs_rangelock_cb, zp); zp->z_acl_cached = NULL; zp->z_xattr_cached = NULL; zp->z_xattr_parent = 0; zp->z_vnode = NULL; zp->z_sync_writes_cnt = 0; zp->z_async_writes_cnt = 0; return (0); } static void zfs_znode_cache_destructor(void *buf, void *arg) { (void) arg; znode_t *zp = buf; ASSERT(!POINTER_IS_VALID(zp->z_zfsvfs)); ASSERT3P(zp->z_vnode, ==, NULL); ASSERT(!list_link_active(&zp->z_link_node)); mutex_destroy(&zp->z_lock); mutex_destroy(&zp->z_acl_lock); rw_destroy(&zp->z_xattr_lock); zfs_rangelock_fini(&zp->z_rangelock); ASSERT3P(zp->z_acl_cached, ==, NULL); ASSERT3P(zp->z_xattr_cached, ==, NULL); ASSERT0(atomic_load_32(&zp->z_sync_writes_cnt)); ASSERT0(atomic_load_32(&zp->z_async_writes_cnt)); } #ifdef _ZFS_USE_SMR VFS_SMR_DECLARE; static int zfs_znode_cache_constructor_smr(void *mem, int size __unused, void *private, int flags) { return (zfs_znode_cache_constructor(mem, private, flags)); } static void zfs_znode_cache_destructor_smr(void *mem, int size __unused, void *private) { zfs_znode_cache_destructor(mem, private); } void zfs_znode_init(void) { /* * Initialize zcache */ ASSERT3P(znode_uma_zone, ==, NULL); znode_uma_zone = uma_zcreate("zfs_znode_cache", sizeof (znode_t), zfs_znode_cache_constructor_smr, zfs_znode_cache_destructor_smr, NULL, NULL, 0, 0); VFS_SMR_ZONE_SET(znode_uma_zone); } static znode_t * zfs_znode_alloc_kmem(int flags) { return (uma_zalloc_smr(znode_uma_zone, flags)); } static void zfs_znode_free_kmem(znode_t *zp) { if (zp->z_xattr_cached) { nvlist_free(zp->z_xattr_cached); zp->z_xattr_cached = NULL; } uma_zfree_smr(znode_uma_zone, zp); } #else void zfs_znode_init(void) { /* * Initialize zcache */ ASSERT3P(znode_cache, ==, NULL); znode_cache = kmem_cache_create("zfs_znode_cache", sizeof (znode_t), 0, zfs_znode_cache_constructor, zfs_znode_cache_destructor, NULL, NULL, NULL, 0); } static znode_t * zfs_znode_alloc_kmem(int flags) { return (kmem_cache_alloc(znode_cache, flags)); } static void zfs_znode_free_kmem(znode_t *zp) { if (zp->z_xattr_cached) { nvlist_free(zp->z_xattr_cached); zp->z_xattr_cached = NULL; } kmem_cache_free(znode_cache, zp); } #endif void zfs_znode_fini(void) { /* * Cleanup zcache */ #ifdef _ZFS_USE_SMR if (znode_uma_zone) { uma_zdestroy(znode_uma_zone); znode_uma_zone = NULL; } #else if (znode_cache) { kmem_cache_destroy(znode_cache); znode_cache = NULL; } #endif } static int zfs_create_share_dir(zfsvfs_t *zfsvfs, dmu_tx_t *tx) { zfs_acl_ids_t acl_ids; vattr_t vattr; znode_t *sharezp; znode_t *zp; int error; vattr.va_mask = AT_MODE|AT_UID|AT_GID; vattr.va_type = VDIR; vattr.va_mode = S_IFDIR|0555; vattr.va_uid = crgetuid(kcred); vattr.va_gid = crgetgid(kcred); sharezp = zfs_znode_alloc_kmem(KM_SLEEP); ASSERT(!POINTER_IS_VALID(sharezp->z_zfsvfs)); sharezp->z_unlinked = 0; sharezp->z_atime_dirty = 0; sharezp->z_zfsvfs = zfsvfs; sharezp->z_is_sa = zfsvfs->z_use_sa; VERIFY0(zfs_acl_ids_create(sharezp, IS_ROOT_NODE, &vattr, kcred, NULL, &acl_ids)); zfs_mknode(sharezp, &vattr, tx, kcred, IS_ROOT_NODE, &zp, &acl_ids); ASSERT3P(zp, ==, sharezp); POINTER_INVALIDATE(&sharezp->z_zfsvfs); error = zap_add(zfsvfs->z_os, MASTER_NODE_OBJ, ZFS_SHARES_DIR, 8, 1, &sharezp->z_id, tx); zfsvfs->z_shares_dir = sharezp->z_id; zfs_acl_ids_free(&acl_ids); sa_handle_destroy(sharezp->z_sa_hdl); zfs_znode_free_kmem(sharezp); return (error); } /* * define a couple of values we need available * for both 64 and 32 bit environments. */ #ifndef NBITSMINOR64 #define NBITSMINOR64 32 #endif #ifndef MAXMAJ64 #define MAXMAJ64 0xffffffffUL #endif #ifndef MAXMIN64 #define MAXMIN64 0xffffffffUL #endif /* * Create special expldev for ZFS private use. * Can't use standard expldev since it doesn't do * what we want. The standard expldev() takes a * dev32_t in LP64 and expands it to a long dev_t. * We need an interface that takes a dev32_t in ILP32 * and expands it to a long dev_t. */ static uint64_t zfs_expldev(dev_t dev) { return (((uint64_t)major(dev) << NBITSMINOR64) | minor(dev)); } /* * Special cmpldev for ZFS private use. * Can't use standard cmpldev since it takes * a long dev_t and compresses it to dev32_t in * LP64. We need to do a compaction of a long dev_t * to a dev32_t in ILP32. */ dev_t zfs_cmpldev(uint64_t dev) { return (makedev((dev >> NBITSMINOR64), (dev & MAXMIN64))); } static void zfs_znode_sa_init(zfsvfs_t *zfsvfs, znode_t *zp, dmu_buf_t *db, dmu_object_type_t obj_type, sa_handle_t *sa_hdl) { ASSERT(!POINTER_IS_VALID(zp->z_zfsvfs) || (zfsvfs == zp->z_zfsvfs)); ASSERT(MUTEX_HELD(ZFS_OBJ_MUTEX(zfsvfs, zp->z_id))); ASSERT3P(zp->z_sa_hdl, ==, NULL); ASSERT3P(zp->z_acl_cached, ==, NULL); if (sa_hdl == NULL) { VERIFY0(sa_handle_get_from_db(zfsvfs->z_os, db, zp, SA_HDL_SHARED, &zp->z_sa_hdl)); } else { zp->z_sa_hdl = sa_hdl; sa_set_userp(sa_hdl, zp); } zp->z_is_sa = (obj_type == DMU_OT_SA) ? B_TRUE : B_FALSE; /* * Slap on VROOT if we are the root znode unless we are the root * node of a snapshot mounted under .zfs. */ if (zp->z_id == zfsvfs->z_root && zfsvfs->z_parent == zfsvfs) ZTOV(zp)->v_flag |= VROOT; vn_exists(ZTOV(zp)); } void zfs_znode_dmu_fini(znode_t *zp) { ASSERT(MUTEX_HELD(ZFS_OBJ_MUTEX(zp->z_zfsvfs, zp->z_id)) || zp->z_unlinked || ZFS_TEARDOWN_INACTIVE_WRITE_HELD(zp->z_zfsvfs)); sa_handle_destroy(zp->z_sa_hdl); zp->z_sa_hdl = NULL; } static void zfs_vnode_forget(vnode_t *vp) { /* copied from insmntque_stddtr */ vp->v_data = NULL; vp->v_op = &dead_vnodeops; vgone(vp); vput(vp); } /* * Construct a new znode/vnode and initialize. * * This does not do a call to dmu_set_user() that is * up to the caller to do, in case you don't want to * return the znode */ static znode_t * zfs_znode_alloc(zfsvfs_t *zfsvfs, dmu_buf_t *db, int blksz, dmu_object_type_t obj_type, sa_handle_t *hdl) { znode_t *zp; vnode_t *vp; uint64_t mode; uint64_t parent; #ifdef notyet uint64_t mtime[2], ctime[2]; #endif uint64_t projid = ZFS_DEFAULT_PROJID; sa_bulk_attr_t bulk[9]; int count = 0; int error; zp = zfs_znode_alloc_kmem(KM_SLEEP); #ifndef _ZFS_USE_SMR KASSERT((zfsvfs->z_parent->z_vfs->mnt_kern_flag & MNTK_FPLOOKUP) == 0, ("%s: fast path lookup enabled without smr", __func__)); #endif #if __FreeBSD_version >= 1300076 KASSERT(curthread->td_vp_reserved != NULL, ("zfs_znode_alloc: getnewvnode without any vnodes reserved")); #else KASSERT(curthread->td_vp_reserv > 0, ("zfs_znode_alloc: getnewvnode without any vnodes reserved")); #endif error = getnewvnode("zfs", zfsvfs->z_parent->z_vfs, &zfs_vnodeops, &vp); if (error != 0) { zfs_znode_free_kmem(zp); return (NULL); } zp->z_vnode = vp; vp->v_data = zp; ASSERT(!POINTER_IS_VALID(zp->z_zfsvfs)); zp->z_sa_hdl = NULL; zp->z_unlinked = 0; zp->z_atime_dirty = 0; zp->z_mapcnt = 0; zp->z_id = db->db_object; zp->z_blksz = blksz; zp->z_seq = 0x7A4653; zp->z_sync_cnt = 0; zp->z_sync_writes_cnt = 0; zp->z_async_writes_cnt = 0; #if __FreeBSD_version >= 1300139 atomic_store_ptr(&zp->z_cached_symlink, NULL); #endif vp = ZTOV(zp); zfs_znode_sa_init(zfsvfs, zp, db, obj_type, hdl); SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_MODE(zfsvfs), NULL, &mode, 8); SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_GEN(zfsvfs), NULL, &zp->z_gen, 8); SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_SIZE(zfsvfs), NULL, &zp->z_size, 8); SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_LINKS(zfsvfs), NULL, &zp->z_links, 8); SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_FLAGS(zfsvfs), NULL, &zp->z_pflags, 8); SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_PARENT(zfsvfs), NULL, &parent, 8); SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_ATIME(zfsvfs), NULL, &zp->z_atime, 16); #ifdef notyet SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_MTIME(zfsvfs), NULL, &mtime, 16); SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_CTIME(zfsvfs), NULL, &ctime, 16); #endif SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_UID(zfsvfs), NULL, &zp->z_uid, 8); SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_GID(zfsvfs), NULL, &zp->z_gid, 8); if (sa_bulk_lookup(zp->z_sa_hdl, bulk, count) != 0 || zp->z_gen == 0 || (dmu_objset_projectquota_enabled(zfsvfs->z_os) && (zp->z_pflags & ZFS_PROJID) && sa_lookup(zp->z_sa_hdl, SA_ZPL_PROJID(zfsvfs), &projid, 8) != 0)) { if (hdl == NULL) sa_handle_destroy(zp->z_sa_hdl); zfs_vnode_forget(vp); zp->z_vnode = NULL; zfs_znode_free_kmem(zp); return (NULL); } zp->z_projid = projid; zp->z_mode = mode; /* Cache the xattr parent id */ if (zp->z_pflags & ZFS_XATTR) zp->z_xattr_parent = parent; vp->v_type = IFTOVT((mode_t)mode); switch (vp->v_type) { case VDIR: zp->z_zn_prefetch = B_TRUE; /* z_prefetch default is enabled */ break; case VFIFO: vp->v_op = &zfs_fifoops; break; case VREG: if (parent == zfsvfs->z_shares_dir) { ASSERT0(zp->z_uid); ASSERT0(zp->z_gid); vp->v_op = &zfs_shareops; } break; default: break; } mutex_enter(&zfsvfs->z_znodes_lock); list_insert_tail(&zfsvfs->z_all_znodes, zp); zfsvfs->z_nr_znodes++; zp->z_zfsvfs = zfsvfs; mutex_exit(&zfsvfs->z_znodes_lock); /* * Acquire vnode lock before making it available to the world. */ vn_lock(vp, LK_EXCLUSIVE | LK_RETRY); VN_LOCK_AREC(vp); if (vp->v_type != VFIFO) VN_LOCK_ASHARE(vp); return (zp); } static uint64_t empty_xattr; static uint64_t pad[4]; static zfs_acl_phys_t acl_phys; /* * Create a new DMU object to hold a zfs znode. * * IN: dzp - parent directory for new znode * vap - file attributes for new znode * tx - dmu transaction id for zap operations * cr - credentials of caller * flag - flags: * IS_ROOT_NODE - new object will be root * IS_XATTR - new object is an attribute * bonuslen - length of bonus buffer * setaclp - File/Dir initial ACL * fuidp - Tracks fuid allocation. * * OUT: zpp - allocated znode * */ void zfs_mknode(znode_t *dzp, vattr_t *vap, dmu_tx_t *tx, cred_t *cr, uint_t flag, znode_t **zpp, zfs_acl_ids_t *acl_ids) { uint64_t crtime[2], atime[2], mtime[2], ctime[2]; uint64_t mode, size, links, parent, pflags; uint64_t dzp_pflags = 0; uint64_t rdev = 0; zfsvfs_t *zfsvfs = dzp->z_zfsvfs; dmu_buf_t *db; timestruc_t now; uint64_t gen, obj; int bonuslen; int dnodesize; sa_handle_t *sa_hdl; dmu_object_type_t obj_type; sa_bulk_attr_t *sa_attrs; int cnt = 0; zfs_acl_locator_cb_t locate = { 0 }; ASSERT3P(vap, !=, NULL); ASSERT3U((vap->va_mask & AT_MODE), ==, AT_MODE); if (zfsvfs->z_replay) { obj = vap->va_nodeid; now = vap->va_ctime; /* see zfs_replay_create() */ gen = vap->va_nblocks; /* ditto */ dnodesize = vap->va_fsid; /* ditto */ } else { obj = 0; vfs_timestamp(&now); gen = dmu_tx_get_txg(tx); dnodesize = dmu_objset_dnodesize(zfsvfs->z_os); } if (dnodesize == 0) dnodesize = DNODE_MIN_SIZE; obj_type = zfsvfs->z_use_sa ? DMU_OT_SA : DMU_OT_ZNODE; bonuslen = (obj_type == DMU_OT_SA) ? DN_BONUS_SIZE(dnodesize) : ZFS_OLD_ZNODE_PHYS_SIZE; /* * Create a new DMU object. */ /* * There's currently no mechanism for pre-reading the blocks that will * be needed to allocate a new object, so we accept the small chance * that there will be an i/o error and we will fail one of the * assertions below. */ if (vap->va_type == VDIR) { if (zfsvfs->z_replay) { VERIFY0(zap_create_claim_norm_dnsize(zfsvfs->z_os, obj, zfsvfs->z_norm, DMU_OT_DIRECTORY_CONTENTS, obj_type, bonuslen, dnodesize, tx)); } else { obj = zap_create_norm_dnsize(zfsvfs->z_os, zfsvfs->z_norm, DMU_OT_DIRECTORY_CONTENTS, obj_type, bonuslen, dnodesize, tx); } } else { if (zfsvfs->z_replay) { VERIFY0(dmu_object_claim_dnsize(zfsvfs->z_os, obj, DMU_OT_PLAIN_FILE_CONTENTS, 0, obj_type, bonuslen, dnodesize, tx)); } else { obj = dmu_object_alloc_dnsize(zfsvfs->z_os, DMU_OT_PLAIN_FILE_CONTENTS, 0, obj_type, bonuslen, dnodesize, tx); } } ZFS_OBJ_HOLD_ENTER(zfsvfs, obj); VERIFY0(sa_buf_hold(zfsvfs->z_os, obj, NULL, &db)); /* * If this is the root, fix up the half-initialized parent pointer * to reference the just-allocated physical data area. */ if (flag & IS_ROOT_NODE) { dzp->z_id = obj; } else { dzp_pflags = dzp->z_pflags; } /* * If parent is an xattr, so am I. */ if (dzp_pflags & ZFS_XATTR) { flag |= IS_XATTR; } if (zfsvfs->z_use_fuids) pflags = ZFS_ARCHIVE | ZFS_AV_MODIFIED; else pflags = 0; if (vap->va_type == VDIR) { size = 2; /* contents ("." and "..") */ links = (flag & (IS_ROOT_NODE | IS_XATTR)) ? 2 : 1; } else { size = links = 0; } if (vap->va_type == VBLK || vap->va_type == VCHR) { rdev = zfs_expldev(vap->va_rdev); } parent = dzp->z_id; mode = acl_ids->z_mode; if (flag & IS_XATTR) pflags |= ZFS_XATTR; /* * No execs denied will be determined when zfs_mode_compute() is called. */ pflags |= acl_ids->z_aclp->z_hints & (ZFS_ACL_TRIVIAL|ZFS_INHERIT_ACE|ZFS_ACL_AUTO_INHERIT| ZFS_ACL_DEFAULTED|ZFS_ACL_PROTECTED); ZFS_TIME_ENCODE(&now, crtime); ZFS_TIME_ENCODE(&now, ctime); if (vap->va_mask & AT_ATIME) { ZFS_TIME_ENCODE(&vap->va_atime, atime); } else { ZFS_TIME_ENCODE(&now, atime); } if (vap->va_mask & AT_MTIME) { ZFS_TIME_ENCODE(&vap->va_mtime, mtime); } else { ZFS_TIME_ENCODE(&now, mtime); } /* Now add in all of the "SA" attributes */ VERIFY0(sa_handle_get_from_db(zfsvfs->z_os, db, NULL, SA_HDL_SHARED, &sa_hdl)); /* * Setup the array of attributes to be replaced/set on the new file * * order for DMU_OT_ZNODE is critical since it needs to be constructed * in the old znode_phys_t format. Don't change this ordering */ sa_attrs = kmem_alloc(sizeof (sa_bulk_attr_t) * ZPL_END, KM_SLEEP); if (obj_type == DMU_OT_ZNODE) { SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_ATIME(zfsvfs), NULL, &atime, 16); SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_MTIME(zfsvfs), NULL, &mtime, 16); SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_CTIME(zfsvfs), NULL, &ctime, 16); SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_CRTIME(zfsvfs), NULL, &crtime, 16); SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_GEN(zfsvfs), NULL, &gen, 8); SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_MODE(zfsvfs), NULL, &mode, 8); SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_SIZE(zfsvfs), NULL, &size, 8); SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_PARENT(zfsvfs), NULL, &parent, 8); } else { SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_MODE(zfsvfs), NULL, &mode, 8); SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_SIZE(zfsvfs), NULL, &size, 8); SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_GEN(zfsvfs), NULL, &gen, 8); SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_UID(zfsvfs), NULL, &acl_ids->z_fuid, 8); SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_GID(zfsvfs), NULL, &acl_ids->z_fgid, 8); SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_PARENT(zfsvfs), NULL, &parent, 8); SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_FLAGS(zfsvfs), NULL, &pflags, 8); SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_ATIME(zfsvfs), NULL, &atime, 16); SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_MTIME(zfsvfs), NULL, &mtime, 16); SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_CTIME(zfsvfs), NULL, &ctime, 16); SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_CRTIME(zfsvfs), NULL, &crtime, 16); } SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_LINKS(zfsvfs), NULL, &links, 8); if (obj_type == DMU_OT_ZNODE) { SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_XATTR(zfsvfs), NULL, &empty_xattr, 8); } if (obj_type == DMU_OT_ZNODE || (vap->va_type == VBLK || vap->va_type == VCHR)) { SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_RDEV(zfsvfs), NULL, &rdev, 8); } if (obj_type == DMU_OT_ZNODE) { SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_FLAGS(zfsvfs), NULL, &pflags, 8); SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_UID(zfsvfs), NULL, &acl_ids->z_fuid, 8); SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_GID(zfsvfs), NULL, &acl_ids->z_fgid, 8); SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_PAD(zfsvfs), NULL, pad, sizeof (uint64_t) * 4); SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_ZNODE_ACL(zfsvfs), NULL, &acl_phys, sizeof (zfs_acl_phys_t)); } else if (acl_ids->z_aclp->z_version >= ZFS_ACL_VERSION_FUID) { SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_DACL_COUNT(zfsvfs), NULL, &acl_ids->z_aclp->z_acl_count, 8); locate.cb_aclp = acl_ids->z_aclp; SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_DACL_ACES(zfsvfs), zfs_acl_data_locator, &locate, acl_ids->z_aclp->z_acl_bytes); mode = zfs_mode_compute(mode, acl_ids->z_aclp, &pflags, acl_ids->z_fuid, acl_ids->z_fgid); } VERIFY0(sa_replace_all_by_template(sa_hdl, sa_attrs, cnt, tx)); if (!(flag & IS_ROOT_NODE)) { *zpp = zfs_znode_alloc(zfsvfs, db, 0, obj_type, sa_hdl); ASSERT3P(*zpp, !=, NULL); } else { /* * If we are creating the root node, the "parent" we * passed in is the znode for the root. */ *zpp = dzp; (*zpp)->z_sa_hdl = sa_hdl; } (*zpp)->z_pflags = pflags; (*zpp)->z_mode = mode; (*zpp)->z_dnodesize = dnodesize; if (vap->va_mask & AT_XVATTR) zfs_xvattr_set(*zpp, (xvattr_t *)vap, tx); if (obj_type == DMU_OT_ZNODE || acl_ids->z_aclp->z_version < ZFS_ACL_VERSION_FUID) { VERIFY0(zfs_aclset_common(*zpp, acl_ids->z_aclp, cr, tx)); } if (!(flag & IS_ROOT_NODE)) { vnode_t *vp = ZTOV(*zpp); vp->v_vflag |= VV_FORCEINSMQ; int err = insmntque(vp, zfsvfs->z_vfs); vp->v_vflag &= ~VV_FORCEINSMQ; (void) err; KASSERT(err == 0, ("insmntque() failed: error %d", err)); } kmem_free(sa_attrs, sizeof (sa_bulk_attr_t) * ZPL_END); ZFS_OBJ_HOLD_EXIT(zfsvfs, obj); } /* * Update in-core attributes. It is assumed the caller will be doing an * sa_bulk_update to push the changes out. */ void zfs_xvattr_set(znode_t *zp, xvattr_t *xvap, dmu_tx_t *tx) { xoptattr_t *xoap; xoap = xva_getxoptattr(xvap); ASSERT3P(xoap, !=, NULL); if (zp->z_zfsvfs->z_replay == B_FALSE) { ASSERT_VOP_IN_SEQC(ZTOV(zp)); } if (XVA_ISSET_REQ(xvap, XAT_CREATETIME)) { uint64_t times[2]; ZFS_TIME_ENCODE(&xoap->xoa_createtime, times); (void) sa_update(zp->z_sa_hdl, SA_ZPL_CRTIME(zp->z_zfsvfs), ×, sizeof (times), tx); XVA_SET_RTN(xvap, XAT_CREATETIME); } if (XVA_ISSET_REQ(xvap, XAT_READONLY)) { ZFS_ATTR_SET(zp, ZFS_READONLY, xoap->xoa_readonly, zp->z_pflags, tx); XVA_SET_RTN(xvap, XAT_READONLY); } if (XVA_ISSET_REQ(xvap, XAT_HIDDEN)) { ZFS_ATTR_SET(zp, ZFS_HIDDEN, xoap->xoa_hidden, zp->z_pflags, tx); XVA_SET_RTN(xvap, XAT_HIDDEN); } if (XVA_ISSET_REQ(xvap, XAT_SYSTEM)) { ZFS_ATTR_SET(zp, ZFS_SYSTEM, xoap->xoa_system, zp->z_pflags, tx); XVA_SET_RTN(xvap, XAT_SYSTEM); } if (XVA_ISSET_REQ(xvap, XAT_ARCHIVE)) { ZFS_ATTR_SET(zp, ZFS_ARCHIVE, xoap->xoa_archive, zp->z_pflags, tx); XVA_SET_RTN(xvap, XAT_ARCHIVE); } if (XVA_ISSET_REQ(xvap, XAT_IMMUTABLE)) { ZFS_ATTR_SET(zp, ZFS_IMMUTABLE, xoap->xoa_immutable, zp->z_pflags, tx); XVA_SET_RTN(xvap, XAT_IMMUTABLE); } if (XVA_ISSET_REQ(xvap, XAT_NOUNLINK)) { ZFS_ATTR_SET(zp, ZFS_NOUNLINK, xoap->xoa_nounlink, zp->z_pflags, tx); XVA_SET_RTN(xvap, XAT_NOUNLINK); } if (XVA_ISSET_REQ(xvap, XAT_APPENDONLY)) { ZFS_ATTR_SET(zp, ZFS_APPENDONLY, xoap->xoa_appendonly, zp->z_pflags, tx); XVA_SET_RTN(xvap, XAT_APPENDONLY); } if (XVA_ISSET_REQ(xvap, XAT_NODUMP)) { ZFS_ATTR_SET(zp, ZFS_NODUMP, xoap->xoa_nodump, zp->z_pflags, tx); XVA_SET_RTN(xvap, XAT_NODUMP); } if (XVA_ISSET_REQ(xvap, XAT_OPAQUE)) { ZFS_ATTR_SET(zp, ZFS_OPAQUE, xoap->xoa_opaque, zp->z_pflags, tx); XVA_SET_RTN(xvap, XAT_OPAQUE); } if (XVA_ISSET_REQ(xvap, XAT_AV_QUARANTINED)) { ZFS_ATTR_SET(zp, ZFS_AV_QUARANTINED, xoap->xoa_av_quarantined, zp->z_pflags, tx); XVA_SET_RTN(xvap, XAT_AV_QUARANTINED); } if (XVA_ISSET_REQ(xvap, XAT_AV_MODIFIED)) { ZFS_ATTR_SET(zp, ZFS_AV_MODIFIED, xoap->xoa_av_modified, zp->z_pflags, tx); XVA_SET_RTN(xvap, XAT_AV_MODIFIED); } if (XVA_ISSET_REQ(xvap, XAT_AV_SCANSTAMP)) { zfs_sa_set_scanstamp(zp, xvap, tx); XVA_SET_RTN(xvap, XAT_AV_SCANSTAMP); } if (XVA_ISSET_REQ(xvap, XAT_REPARSE)) { ZFS_ATTR_SET(zp, ZFS_REPARSE, xoap->xoa_reparse, zp->z_pflags, tx); XVA_SET_RTN(xvap, XAT_REPARSE); } if (XVA_ISSET_REQ(xvap, XAT_OFFLINE)) { ZFS_ATTR_SET(zp, ZFS_OFFLINE, xoap->xoa_offline, zp->z_pflags, tx); XVA_SET_RTN(xvap, XAT_OFFLINE); } if (XVA_ISSET_REQ(xvap, XAT_SPARSE)) { ZFS_ATTR_SET(zp, ZFS_SPARSE, xoap->xoa_sparse, zp->z_pflags, tx); XVA_SET_RTN(xvap, XAT_SPARSE); } } int zfs_zget(zfsvfs_t *zfsvfs, uint64_t obj_num, znode_t **zpp) { dmu_object_info_t doi; dmu_buf_t *db; znode_t *zp; vnode_t *vp; sa_handle_t *hdl; int locked; int err; getnewvnode_reserve_(); again: *zpp = NULL; ZFS_OBJ_HOLD_ENTER(zfsvfs, obj_num); err = sa_buf_hold(zfsvfs->z_os, obj_num, NULL, &db); if (err) { ZFS_OBJ_HOLD_EXIT(zfsvfs, obj_num); getnewvnode_drop_reserve(); return (err); } dmu_object_info_from_db(db, &doi); if (doi.doi_bonus_type != DMU_OT_SA && (doi.doi_bonus_type != DMU_OT_ZNODE || (doi.doi_bonus_type == DMU_OT_ZNODE && doi.doi_bonus_size < sizeof (znode_phys_t)))) { sa_buf_rele(db, NULL); ZFS_OBJ_HOLD_EXIT(zfsvfs, obj_num); getnewvnode_drop_reserve(); return (SET_ERROR(EINVAL)); } hdl = dmu_buf_get_user(db); if (hdl != NULL) { zp = sa_get_userdata(hdl); /* * Since "SA" does immediate eviction we * should never find a sa handle that doesn't * know about the znode. */ ASSERT3P(zp, !=, NULL); ASSERT3U(zp->z_id, ==, obj_num); if (zp->z_unlinked) { err = SET_ERROR(ENOENT); } else { vp = ZTOV(zp); /* * Don't let the vnode disappear after * ZFS_OBJ_HOLD_EXIT. */ VN_HOLD(vp); *zpp = zp; err = 0; } sa_buf_rele(db, NULL); ZFS_OBJ_HOLD_EXIT(zfsvfs, obj_num); if (err) { getnewvnode_drop_reserve(); return (err); } locked = VOP_ISLOCKED(vp); VI_LOCK(vp); if (VN_IS_DOOMED(vp) && locked != LK_EXCLUSIVE) { /* * The vnode is doomed and this thread doesn't * hold the exclusive lock on it, so the vnode * must be being reclaimed by another thread. * Otherwise the doomed vnode is being reclaimed * by this thread and zfs_zget is called from * ZIL internals. */ VI_UNLOCK(vp); /* * XXX vrele() locks the vnode when the last reference * is dropped. Although in this case the vnode is * doomed / dead and so no inactivation is required, * the vnode lock is still acquired. That could result * in a LOR with z_teardown_lock if another thread holds * the vnode's lock and tries to take z_teardown_lock. * But that is only possible if the other thread peforms * a ZFS vnode operation on the vnode. That either * should not happen if the vnode is dead or the thread * should also have a reference to the vnode and thus * our reference is not last. */ VN_RELE(vp); goto again; } VI_UNLOCK(vp); getnewvnode_drop_reserve(); return (err); } /* * Not found create new znode/vnode * but only if file exists. * * There is a small window where zfs_vget() could * find this object while a file create is still in * progress. This is checked for in zfs_znode_alloc() * * if zfs_znode_alloc() fails it will drop the hold on the * bonus buffer. */ zp = zfs_znode_alloc(zfsvfs, db, doi.doi_data_block_size, doi.doi_bonus_type, NULL); if (zp == NULL) { err = SET_ERROR(ENOENT); } else { *zpp = zp; } if (err == 0) { vnode_t *vp = ZTOV(zp); err = insmntque(vp, zfsvfs->z_vfs); if (err == 0) { vp->v_hash = obj_num; VOP_UNLOCK1(vp); } else { zp->z_vnode = NULL; zfs_znode_dmu_fini(zp); zfs_znode_free(zp); *zpp = NULL; } } ZFS_OBJ_HOLD_EXIT(zfsvfs, obj_num); getnewvnode_drop_reserve(); return (err); } int zfs_rezget(znode_t *zp) { zfsvfs_t *zfsvfs = zp->z_zfsvfs; dmu_object_info_t doi; dmu_buf_t *db; vnode_t *vp; uint64_t obj_num = zp->z_id; uint64_t mode, size; sa_bulk_attr_t bulk[8]; int err; int count = 0; uint64_t gen; /* * Remove cached pages before reloading the znode, so that they are not * lingering after we run into any error. Ideally, we should vgone() * the vnode in case of error, but currently we cannot do that * because of the LOR between the vnode lock and z_teardown_lock. * So, instead, we have to "doom" the znode in the illumos style. * * Ignore invalid pages during the scan. This is to avoid deadlocks * between page busying and the teardown lock, as pages are busied prior * to a VOP_GETPAGES operation, which acquires the teardown read lock. * Such pages will be invalid and can safely be skipped here. */ vp = ZTOV(zp); #if __FreeBSD_version >= 1400042 vn_pages_remove_valid(vp, 0, 0); #else vn_pages_remove(vp, 0, 0); #endif ZFS_OBJ_HOLD_ENTER(zfsvfs, obj_num); mutex_enter(&zp->z_acl_lock); if (zp->z_acl_cached) { zfs_acl_free(zp->z_acl_cached); zp->z_acl_cached = NULL; } mutex_exit(&zp->z_acl_lock); rw_enter(&zp->z_xattr_lock, RW_WRITER); if (zp->z_xattr_cached) { nvlist_free(zp->z_xattr_cached); zp->z_xattr_cached = NULL; } rw_exit(&zp->z_xattr_lock); ASSERT3P(zp->z_sa_hdl, ==, NULL); err = sa_buf_hold(zfsvfs->z_os, obj_num, NULL, &db); if (err) { ZFS_OBJ_HOLD_EXIT(zfsvfs, obj_num); return (err); } dmu_object_info_from_db(db, &doi); if (doi.doi_bonus_type != DMU_OT_SA && (doi.doi_bonus_type != DMU_OT_ZNODE || (doi.doi_bonus_type == DMU_OT_ZNODE && doi.doi_bonus_size < sizeof (znode_phys_t)))) { sa_buf_rele(db, NULL); ZFS_OBJ_HOLD_EXIT(zfsvfs, obj_num); return (SET_ERROR(EINVAL)); } zfs_znode_sa_init(zfsvfs, zp, db, doi.doi_bonus_type, NULL); size = zp->z_size; /* reload cached values */ SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_GEN(zfsvfs), NULL, &gen, sizeof (gen)); SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_SIZE(zfsvfs), NULL, &zp->z_size, sizeof (zp->z_size)); SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_LINKS(zfsvfs), NULL, &zp->z_links, sizeof (zp->z_links)); SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_FLAGS(zfsvfs), NULL, &zp->z_pflags, sizeof (zp->z_pflags)); SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_ATIME(zfsvfs), NULL, &zp->z_atime, sizeof (zp->z_atime)); SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_UID(zfsvfs), NULL, &zp->z_uid, sizeof (zp->z_uid)); SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_GID(zfsvfs), NULL, &zp->z_gid, sizeof (zp->z_gid)); SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_MODE(zfsvfs), NULL, &mode, sizeof (mode)); if (sa_bulk_lookup(zp->z_sa_hdl, bulk, count)) { zfs_znode_dmu_fini(zp); ZFS_OBJ_HOLD_EXIT(zfsvfs, obj_num); return (SET_ERROR(EIO)); } zp->z_mode = mode; if (gen != zp->z_gen) { zfs_znode_dmu_fini(zp); ZFS_OBJ_HOLD_EXIT(zfsvfs, obj_num); return (SET_ERROR(EIO)); } /* * It is highly improbable but still quite possible that two * objects in different datasets are created with the same * object numbers and in transaction groups with the same * numbers. znodes corresponding to those objects would * have the same z_id and z_gen, but their other attributes * may be different. * zfs recv -F may replace one of such objects with the other. * As a result file properties recorded in the replaced * object's vnode may no longer match the received object's * properties. At present the only cached property is the * files type recorded in v_type. * So, handle this case by leaving the old vnode and znode * disassociated from the actual object. A new vnode and a * znode will be created if the object is accessed * (e.g. via a look-up). The old vnode and znode will be * recycled when the last vnode reference is dropped. */ if (vp->v_type != IFTOVT((mode_t)zp->z_mode)) { zfs_znode_dmu_fini(zp); ZFS_OBJ_HOLD_EXIT(zfsvfs, obj_num); return (SET_ERROR(EIO)); } /* * If the file has zero links, then it has been unlinked on the send * side and it must be in the received unlinked set. * We call zfs_znode_dmu_fini() now to prevent any accesses to the * stale data and to prevent automatically removal of the file in * zfs_zinactive(). The file will be removed either when it is removed * on the send side and the next incremental stream is received or * when the unlinked set gets processed. */ zp->z_unlinked = (zp->z_links == 0); if (zp->z_unlinked) { zfs_znode_dmu_fini(zp); ZFS_OBJ_HOLD_EXIT(zfsvfs, obj_num); return (0); } zp->z_blksz = doi.doi_data_block_size; if (zp->z_size != size) vnode_pager_setsize(vp, zp->z_size); ZFS_OBJ_HOLD_EXIT(zfsvfs, obj_num); return (0); } void zfs_znode_delete(znode_t *zp, dmu_tx_t *tx) { zfsvfs_t *zfsvfs = zp->z_zfsvfs; objset_t *os = zfsvfs->z_os; uint64_t obj = zp->z_id; uint64_t acl_obj = zfs_external_acl(zp); ZFS_OBJ_HOLD_ENTER(zfsvfs, obj); if (acl_obj) { VERIFY(!zp->z_is_sa); VERIFY0(dmu_object_free(os, acl_obj, tx)); } VERIFY0(dmu_object_free(os, obj, tx)); zfs_znode_dmu_fini(zp); ZFS_OBJ_HOLD_EXIT(zfsvfs, obj); zfs_znode_free(zp); } void zfs_zinactive(znode_t *zp) { zfsvfs_t *zfsvfs = zp->z_zfsvfs; uint64_t z_id = zp->z_id; ASSERT3P(zp->z_sa_hdl, !=, NULL); /* * Don't allow a zfs_zget() while were trying to release this znode */ ZFS_OBJ_HOLD_ENTER(zfsvfs, z_id); /* * If this was the last reference to a file with no links, remove * the file from the file system unless the file system is mounted * read-only. That can happen, for example, if the file system was * originally read-write, the file was opened, then unlinked and * the file system was made read-only before the file was finally * closed. The file will remain in the unlinked set. */ if (zp->z_unlinked) { ASSERT(!zfsvfs->z_issnap); if ((zfsvfs->z_vfs->vfs_flag & VFS_RDONLY) == 0) { ZFS_OBJ_HOLD_EXIT(zfsvfs, z_id); zfs_rmnode(zp); return; } } zfs_znode_dmu_fini(zp); ZFS_OBJ_HOLD_EXIT(zfsvfs, z_id); zfs_znode_free(zp); } void zfs_znode_free(znode_t *zp) { zfsvfs_t *zfsvfs = zp->z_zfsvfs; #if __FreeBSD_version >= 1300139 char *symlink; #endif ASSERT3P(zp->z_sa_hdl, ==, NULL); zp->z_vnode = NULL; mutex_enter(&zfsvfs->z_znodes_lock); POINTER_INVALIDATE(&zp->z_zfsvfs); list_remove(&zfsvfs->z_all_znodes, zp); zfsvfs->z_nr_znodes--; mutex_exit(&zfsvfs->z_znodes_lock); #if __FreeBSD_version >= 1300139 symlink = atomic_load_ptr(&zp->z_cached_symlink); if (symlink != NULL) { atomic_store_rel_ptr((uintptr_t *)&zp->z_cached_symlink, (uintptr_t)NULL); cache_symlink_free(symlink, strlen(symlink) + 1); } #endif if (zp->z_acl_cached) { zfs_acl_free(zp->z_acl_cached); zp->z_acl_cached = NULL; } zfs_znode_free_kmem(zp); } void zfs_tstamp_update_setup_ext(znode_t *zp, uint_t flag, uint64_t mtime[2], uint64_t ctime[2], boolean_t have_tx) { timestruc_t now; vfs_timestamp(&now); if (have_tx) { /* will sa_bulk_update happen really soon? */ zp->z_atime_dirty = 0; zp->z_seq++; } else { zp->z_atime_dirty = 1; } if (flag & AT_ATIME) { ZFS_TIME_ENCODE(&now, zp->z_atime); } if (flag & AT_MTIME) { ZFS_TIME_ENCODE(&now, mtime); if (zp->z_zfsvfs->z_use_fuids) { zp->z_pflags |= (ZFS_ARCHIVE | ZFS_AV_MODIFIED); } } if (flag & AT_CTIME) { ZFS_TIME_ENCODE(&now, ctime); if (zp->z_zfsvfs->z_use_fuids) zp->z_pflags |= ZFS_ARCHIVE; } } void zfs_tstamp_update_setup(znode_t *zp, uint_t flag, uint64_t mtime[2], uint64_t ctime[2]) { zfs_tstamp_update_setup_ext(zp, flag, mtime, ctime, B_TRUE); } /* * Grow the block size for a file. * * IN: zp - znode of file to free data in. * size - requested block size * tx - open transaction. * * NOTE: this function assumes that the znode is write locked. */ void zfs_grow_blocksize(znode_t *zp, uint64_t size, dmu_tx_t *tx) { int error; u_longlong_t dummy; if (size <= zp->z_blksz) return; /* * If the file size is already greater than the current blocksize, * we will not grow. If there is more than one block in a file, * the blocksize cannot change. */ if (zp->z_blksz && zp->z_size > zp->z_blksz) return; error = dmu_object_set_blocksize(zp->z_zfsvfs->z_os, zp->z_id, size, 0, tx); if (error == ENOTSUP) return; ASSERT0(error); /* What blocksize did we actually get? */ dmu_object_size_from_db(sa_get_db(zp->z_sa_hdl), &zp->z_blksz, &dummy); } /* * Increase the file length * * IN: zp - znode of file to free data in. * end - new end-of-file * * RETURN: 0 on success, error code on failure */ static int zfs_extend(znode_t *zp, uint64_t end) { zfsvfs_t *zfsvfs = zp->z_zfsvfs; dmu_tx_t *tx; zfs_locked_range_t *lr; uint64_t newblksz; int error; /* * We will change zp_size, lock the whole file. */ lr = zfs_rangelock_enter(&zp->z_rangelock, 0, UINT64_MAX, RL_WRITER); /* * Nothing to do if file already at desired length. */ if (end <= zp->z_size) { zfs_rangelock_exit(lr); return (0); } tx = dmu_tx_create(zfsvfs->z_os); dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_FALSE); zfs_sa_upgrade_txholds(tx, zp); if (end > zp->z_blksz && (!ISP2(zp->z_blksz) || zp->z_blksz < zfsvfs->z_max_blksz)) { /* * We are growing the file past the current block size. */ if (zp->z_blksz > zp->z_zfsvfs->z_max_blksz) { /* * File's blocksize is already larger than the * "recordsize" property. Only let it grow to * the next power of 2. */ ASSERT(!ISP2(zp->z_blksz)); newblksz = MIN(end, 1 << highbit64(zp->z_blksz)); } else { newblksz = MIN(end, zp->z_zfsvfs->z_max_blksz); } dmu_tx_hold_write(tx, zp->z_id, 0, newblksz); } else { newblksz = 0; } error = dmu_tx_assign(tx, TXG_WAIT); if (error) { dmu_tx_abort(tx); zfs_rangelock_exit(lr); return (error); } if (newblksz) zfs_grow_blocksize(zp, newblksz, tx); zp->z_size = end; VERIFY0(sa_update(zp->z_sa_hdl, SA_ZPL_SIZE(zp->z_zfsvfs), &zp->z_size, sizeof (zp->z_size), tx)); vnode_pager_setsize(ZTOV(zp), end); zfs_rangelock_exit(lr); dmu_tx_commit(tx); return (0); } /* * Free space in a file. * * IN: zp - znode of file to free data in. * off - start of section to free. * len - length of section to free. * * RETURN: 0 on success, error code on failure */ static int zfs_free_range(znode_t *zp, uint64_t off, uint64_t len) { zfsvfs_t *zfsvfs = zp->z_zfsvfs; zfs_locked_range_t *lr; int error; /* * Lock the range being freed. */ lr = zfs_rangelock_enter(&zp->z_rangelock, off, len, RL_WRITER); /* * Nothing to do if file already at desired length. */ if (off >= zp->z_size) { zfs_rangelock_exit(lr); return (0); } if (off + len > zp->z_size) len = zp->z_size - off; error = dmu_free_long_range(zfsvfs->z_os, zp->z_id, off, len); if (error == 0) { #if __FreeBSD_version >= 1400032 vnode_pager_purge_range(ZTOV(zp), off, off + len); #else /* * Before __FreeBSD_version 1400032 we cannot free block in the * middle of a file, but only at the end of a file, so this code * path should never happen. */ vnode_pager_setsize(ZTOV(zp), off); #endif } zfs_rangelock_exit(lr); return (error); } /* * Truncate a file * * IN: zp - znode of file to free data in. * end - new end-of-file. * * RETURN: 0 on success, error code on failure */ static int zfs_trunc(znode_t *zp, uint64_t end) { zfsvfs_t *zfsvfs = zp->z_zfsvfs; vnode_t *vp = ZTOV(zp); dmu_tx_t *tx; zfs_locked_range_t *lr; int error; sa_bulk_attr_t bulk[2]; int count = 0; /* * We will change zp_size, lock the whole file. */ lr = zfs_rangelock_enter(&zp->z_rangelock, 0, UINT64_MAX, RL_WRITER); /* * Nothing to do if file already at desired length. */ if (end >= zp->z_size) { zfs_rangelock_exit(lr); return (0); } error = dmu_free_long_range(zfsvfs->z_os, zp->z_id, end, DMU_OBJECT_END); if (error) { zfs_rangelock_exit(lr); return (error); } tx = dmu_tx_create(zfsvfs->z_os); dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_FALSE); zfs_sa_upgrade_txholds(tx, zp); dmu_tx_mark_netfree(tx); error = dmu_tx_assign(tx, TXG_WAIT); if (error) { dmu_tx_abort(tx); zfs_rangelock_exit(lr); return (error); } zp->z_size = end; SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_SIZE(zfsvfs), NULL, &zp->z_size, sizeof (zp->z_size)); if (end == 0) { zp->z_pflags &= ~ZFS_SPARSE; SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_FLAGS(zfsvfs), NULL, &zp->z_pflags, 8); } VERIFY0(sa_bulk_update(zp->z_sa_hdl, bulk, count, tx)); dmu_tx_commit(tx); /* * Clear any mapped pages in the truncated region. This has to * happen outside of the transaction to avoid the possibility of * a deadlock with someone trying to push a page that we are * about to invalidate. */ vnode_pager_setsize(vp, end); zfs_rangelock_exit(lr); return (0); } /* * Free space in a file * * IN: zp - znode of file to free data in. * off - start of range * len - end of range (0 => EOF) * flag - current file open mode flags. * log - TRUE if this action should be logged * * RETURN: 0 on success, error code on failure */ int zfs_freesp(znode_t *zp, uint64_t off, uint64_t len, int flag, boolean_t log) { dmu_tx_t *tx; zfsvfs_t *zfsvfs = zp->z_zfsvfs; zilog_t *zilog = zfsvfs->z_log; uint64_t mode; uint64_t mtime[2], ctime[2]; sa_bulk_attr_t bulk[3]; int count = 0; int error; if ((error = sa_lookup(zp->z_sa_hdl, SA_ZPL_MODE(zfsvfs), &mode, sizeof (mode))) != 0) return (error); if (off > zp->z_size) { error = zfs_extend(zp, off+len); if (error == 0 && log) goto log; else return (error); } if (len == 0) { error = zfs_trunc(zp, off); } else { if ((error = zfs_free_range(zp, off, len)) == 0 && off + len > zp->z_size) error = zfs_extend(zp, off+len); } if (error || !log) return (error); log: tx = dmu_tx_create(zfsvfs->z_os); dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_FALSE); zfs_sa_upgrade_txholds(tx, zp); error = dmu_tx_assign(tx, TXG_WAIT); if (error) { dmu_tx_abort(tx); return (error); } SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_MTIME(zfsvfs), NULL, mtime, 16); SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_CTIME(zfsvfs), NULL, ctime, 16); SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_FLAGS(zfsvfs), NULL, &zp->z_pflags, 8); zfs_tstamp_update_setup(zp, CONTENT_MODIFIED, mtime, ctime); error = sa_bulk_update(zp->z_sa_hdl, bulk, count, tx); ASSERT0(error); zfs_log_truncate(zilog, tx, TX_TRUNCATE, zp, off, len); dmu_tx_commit(tx); return (0); } void zfs_create_fs(objset_t *os, cred_t *cr, nvlist_t *zplprops, dmu_tx_t *tx) { uint64_t moid, obj, sa_obj, version; uint64_t sense = ZFS_CASE_SENSITIVE; uint64_t norm = 0; nvpair_t *elem; int error; int i; znode_t *rootzp = NULL; zfsvfs_t *zfsvfs; vattr_t vattr; znode_t *zp; zfs_acl_ids_t acl_ids; /* * First attempt to create master node. */ /* * In an empty objset, there are no blocks to read and thus * there can be no i/o errors (which we assert below). */ moid = MASTER_NODE_OBJ; error = zap_create_claim(os, moid, DMU_OT_MASTER_NODE, DMU_OT_NONE, 0, tx); ASSERT0(error); /* * Set starting attributes. */ version = zfs_zpl_version_map(spa_version(dmu_objset_spa(os))); elem = NULL; while ((elem = nvlist_next_nvpair(zplprops, elem)) != NULL) { /* For the moment we expect all zpl props to be uint64_ts */ uint64_t val; char *name; ASSERT3S(nvpair_type(elem), ==, DATA_TYPE_UINT64); val = fnvpair_value_uint64(elem); name = nvpair_name(elem); if (strcmp(name, zfs_prop_to_name(ZFS_PROP_VERSION)) == 0) { if (val < version) version = val; } else { error = zap_update(os, moid, name, 8, 1, &val, tx); } ASSERT0(error); if (strcmp(name, zfs_prop_to_name(ZFS_PROP_NORMALIZE)) == 0) norm = val; else if (strcmp(name, zfs_prop_to_name(ZFS_PROP_CASE)) == 0) sense = val; } ASSERT3U(version, !=, 0); error = zap_update(os, moid, ZPL_VERSION_STR, 8, 1, &version, tx); /* * Create zap object used for SA attribute registration */ if (version >= ZPL_VERSION_SA) { sa_obj = zap_create(os, DMU_OT_SA_MASTER_NODE, DMU_OT_NONE, 0, tx); error = zap_add(os, moid, ZFS_SA_ATTRS, 8, 1, &sa_obj, tx); ASSERT0(error); } else { sa_obj = 0; } /* * Create a delete queue. */ obj = zap_create(os, DMU_OT_UNLINKED_SET, DMU_OT_NONE, 0, tx); error = zap_add(os, moid, ZFS_UNLINKED_SET, 8, 1, &obj, tx); ASSERT0(error); /* * Create root znode. Create minimal znode/vnode/zfsvfs * to allow zfs_mknode to work. */ VATTR_NULL(&vattr); vattr.va_mask = AT_MODE|AT_UID|AT_GID; vattr.va_type = VDIR; vattr.va_mode = S_IFDIR|0755; vattr.va_uid = crgetuid(cr); vattr.va_gid = crgetgid(cr); zfsvfs = kmem_zalloc(sizeof (zfsvfs_t), KM_SLEEP); rootzp = zfs_znode_alloc_kmem(KM_SLEEP); ASSERT(!POINTER_IS_VALID(rootzp->z_zfsvfs)); rootzp->z_unlinked = 0; rootzp->z_atime_dirty = 0; rootzp->z_is_sa = USE_SA(version, os); zfsvfs->z_os = os; zfsvfs->z_parent = zfsvfs; zfsvfs->z_version = version; zfsvfs->z_use_fuids = USE_FUIDS(version, os); zfsvfs->z_use_sa = USE_SA(version, os); zfsvfs->z_norm = norm; error = sa_setup(os, sa_obj, zfs_attr_table, ZPL_END, &zfsvfs->z_attr_table); ASSERT0(error); /* * Fold case on file systems that are always or sometimes case * insensitive. */ if (sense == ZFS_CASE_INSENSITIVE || sense == ZFS_CASE_MIXED) zfsvfs->z_norm |= U8_TEXTPREP_TOUPPER; mutex_init(&zfsvfs->z_znodes_lock, NULL, MUTEX_DEFAULT, NULL); list_create(&zfsvfs->z_all_znodes, sizeof (znode_t), offsetof(znode_t, z_link_node)); for (i = 0; i != ZFS_OBJ_MTX_SZ; i++) mutex_init(&zfsvfs->z_hold_mtx[i], NULL, MUTEX_DEFAULT, NULL); rootzp->z_zfsvfs = zfsvfs; VERIFY0(zfs_acl_ids_create(rootzp, IS_ROOT_NODE, &vattr, cr, NULL, &acl_ids)); zfs_mknode(rootzp, &vattr, tx, cr, IS_ROOT_NODE, &zp, &acl_ids); ASSERT3P(zp, ==, rootzp); error = zap_add(os, moid, ZFS_ROOT_OBJ, 8, 1, &rootzp->z_id, tx); ASSERT0(error); zfs_acl_ids_free(&acl_ids); POINTER_INVALIDATE(&rootzp->z_zfsvfs); sa_handle_destroy(rootzp->z_sa_hdl); zfs_znode_free_kmem(rootzp); /* * Create shares directory */ error = zfs_create_share_dir(zfsvfs, tx); ASSERT0(error); for (i = 0; i != ZFS_OBJ_MTX_SZ; i++) mutex_destroy(&zfsvfs->z_hold_mtx[i]); kmem_free(zfsvfs, sizeof (zfsvfs_t)); } #endif /* _KERNEL */ static int zfs_sa_setup(objset_t *osp, sa_attr_type_t **sa_table) { uint64_t sa_obj = 0; int error; error = zap_lookup(osp, MASTER_NODE_OBJ, ZFS_SA_ATTRS, 8, 1, &sa_obj); if (error != 0 && error != ENOENT) return (error); error = sa_setup(osp, sa_obj, zfs_attr_table, ZPL_END, sa_table); return (error); } static int zfs_grab_sa_handle(objset_t *osp, uint64_t obj, sa_handle_t **hdlp, dmu_buf_t **db, const void *tag) { dmu_object_info_t doi; int error; if ((error = sa_buf_hold(osp, obj, tag, db)) != 0) return (error); dmu_object_info_from_db(*db, &doi); if ((doi.doi_bonus_type != DMU_OT_SA && doi.doi_bonus_type != DMU_OT_ZNODE) || (doi.doi_bonus_type == DMU_OT_ZNODE && doi.doi_bonus_size < sizeof (znode_phys_t))) { sa_buf_rele(*db, tag); return (SET_ERROR(ENOTSUP)); } error = sa_handle_get(osp, obj, NULL, SA_HDL_PRIVATE, hdlp); if (error != 0) { sa_buf_rele(*db, tag); return (error); } return (0); } static void zfs_release_sa_handle(sa_handle_t *hdl, dmu_buf_t *db, const void *tag) { sa_handle_destroy(hdl); sa_buf_rele(db, tag); } /* * Given an object number, return its parent object number and whether * or not the object is an extended attribute directory. */ static int zfs_obj_to_pobj(objset_t *osp, sa_handle_t *hdl, sa_attr_type_t *sa_table, uint64_t *pobjp, int *is_xattrdir) { uint64_t parent; uint64_t pflags; uint64_t mode; uint64_t parent_mode; sa_bulk_attr_t bulk[3]; sa_handle_t *sa_hdl; dmu_buf_t *sa_db; int count = 0; int error; SA_ADD_BULK_ATTR(bulk, count, sa_table[ZPL_PARENT], NULL, &parent, sizeof (parent)); SA_ADD_BULK_ATTR(bulk, count, sa_table[ZPL_FLAGS], NULL, &pflags, sizeof (pflags)); SA_ADD_BULK_ATTR(bulk, count, sa_table[ZPL_MODE], NULL, &mode, sizeof (mode)); if ((error = sa_bulk_lookup(hdl, bulk, count)) != 0) return (error); /* * When a link is removed its parent pointer is not changed and will * be invalid. There are two cases where a link is removed but the * file stays around, when it goes to the delete queue and when there * are additional links. */ error = zfs_grab_sa_handle(osp, parent, &sa_hdl, &sa_db, FTAG); if (error != 0) return (error); error = sa_lookup(sa_hdl, ZPL_MODE, &parent_mode, sizeof (parent_mode)); zfs_release_sa_handle(sa_hdl, sa_db, FTAG); if (error != 0) return (error); *is_xattrdir = ((pflags & ZFS_XATTR) != 0) && S_ISDIR(mode); /* * Extended attributes can be applied to files, directories, etc. * Otherwise the parent must be a directory. */ if (!*is_xattrdir && !S_ISDIR(parent_mode)) return (SET_ERROR(EINVAL)); *pobjp = parent; return (0); } /* * Given an object number, return some zpl level statistics */ static int zfs_obj_to_stats_impl(sa_handle_t *hdl, sa_attr_type_t *sa_table, zfs_stat_t *sb) { sa_bulk_attr_t bulk[4]; int count = 0; SA_ADD_BULK_ATTR(bulk, count, sa_table[ZPL_MODE], NULL, &sb->zs_mode, sizeof (sb->zs_mode)); SA_ADD_BULK_ATTR(bulk, count, sa_table[ZPL_GEN], NULL, &sb->zs_gen, sizeof (sb->zs_gen)); SA_ADD_BULK_ATTR(bulk, count, sa_table[ZPL_LINKS], NULL, &sb->zs_links, sizeof (sb->zs_links)); SA_ADD_BULK_ATTR(bulk, count, sa_table[ZPL_CTIME], NULL, &sb->zs_ctime, sizeof (sb->zs_ctime)); return (sa_bulk_lookup(hdl, bulk, count)); } static int zfs_obj_to_path_impl(objset_t *osp, uint64_t obj, sa_handle_t *hdl, sa_attr_type_t *sa_table, char *buf, int len) { sa_handle_t *sa_hdl; sa_handle_t *prevhdl = NULL; dmu_buf_t *prevdb = NULL; dmu_buf_t *sa_db = NULL; char *path = buf + len - 1; int error; *path = '\0'; sa_hdl = hdl; uint64_t deleteq_obj; VERIFY0(zap_lookup(osp, MASTER_NODE_OBJ, ZFS_UNLINKED_SET, sizeof (uint64_t), 1, &deleteq_obj)); error = zap_lookup_int(osp, deleteq_obj, obj); if (error == 0) { return (ESTALE); } else if (error != ENOENT) { return (error); } - error = 0; for (;;) { uint64_t pobj; char component[MAXNAMELEN + 2]; size_t complen; int is_xattrdir; if (prevdb) { ASSERT3P(prevhdl, !=, NULL); zfs_release_sa_handle(prevhdl, prevdb, FTAG); } if ((error = zfs_obj_to_pobj(osp, sa_hdl, sa_table, &pobj, &is_xattrdir)) != 0) break; if (pobj == obj) { if (path[0] != '/') *--path = '/'; break; } component[0] = '/'; if (is_xattrdir) { (void) sprintf(component + 1, ""); } else { error = zap_value_search(osp, pobj, obj, ZFS_DIRENT_OBJ(-1ULL), component + 1); if (error != 0) break; } complen = strlen(component); path -= complen; ASSERT3P(path, >=, buf); memcpy(path, component, complen); obj = pobj; if (sa_hdl != hdl) { prevhdl = sa_hdl; prevdb = sa_db; } error = zfs_grab_sa_handle(osp, obj, &sa_hdl, &sa_db, FTAG); if (error != 0) { sa_hdl = prevhdl; sa_db = prevdb; break; } } if (sa_hdl != NULL && sa_hdl != hdl) { ASSERT3P(sa_db, !=, NULL); zfs_release_sa_handle(sa_hdl, sa_db, FTAG); } if (error == 0) (void) memmove(buf, path, buf + len - path); return (error); } int zfs_obj_to_path(objset_t *osp, uint64_t obj, char *buf, int len) { sa_attr_type_t *sa_table; sa_handle_t *hdl; dmu_buf_t *db; int error; error = zfs_sa_setup(osp, &sa_table); if (error != 0) return (error); error = zfs_grab_sa_handle(osp, obj, &hdl, &db, FTAG); if (error != 0) return (error); error = zfs_obj_to_path_impl(osp, obj, hdl, sa_table, buf, len); zfs_release_sa_handle(hdl, db, FTAG); return (error); } int zfs_obj_to_stats(objset_t *osp, uint64_t obj, zfs_stat_t *sb, char *buf, int len) { char *path = buf + len - 1; sa_attr_type_t *sa_table; sa_handle_t *hdl; dmu_buf_t *db; int error; *path = '\0'; error = zfs_sa_setup(osp, &sa_table); if (error != 0) return (error); error = zfs_grab_sa_handle(osp, obj, &hdl, &db, FTAG); if (error != 0) return (error); error = zfs_obj_to_stats_impl(hdl, sa_table, sb); if (error != 0) { zfs_release_sa_handle(hdl, db, FTAG); return (error); } error = zfs_obj_to_path_impl(osp, obj, hdl, sa_table, buf, len); zfs_release_sa_handle(hdl, db, FTAG); return (error); } void zfs_znode_update_vfs(znode_t *zp) { vm_object_t object; if ((object = ZTOV(zp)->v_object) == NULL || zp->z_size == object->un_pager.vnp.vnp_size) return; vnode_pager_setsize(ZTOV(zp), zp->z_size); } #ifdef _KERNEL int zfs_znode_parent_and_name(znode_t *zp, znode_t **dzpp, char *buf) { zfsvfs_t *zfsvfs = zp->z_zfsvfs; uint64_t parent; int is_xattrdir; int err; /* Extended attributes should not be visible as regular files. */ if ((zp->z_pflags & ZFS_XATTR) != 0) return (SET_ERROR(EINVAL)); err = zfs_obj_to_pobj(zfsvfs->z_os, zp->z_sa_hdl, zfsvfs->z_attr_table, &parent, &is_xattrdir); if (err != 0) return (err); ASSERT0(is_xattrdir); /* No name as this is a root object. */ if (parent == zp->z_id) return (SET_ERROR(EINVAL)); err = zap_value_search(zfsvfs->z_os, parent, zp->z_id, ZFS_DIRENT_OBJ(-1ULL), buf); if (err != 0) return (err); err = zfs_zget(zfsvfs, parent, dzpp); return (err); } #endif /* _KERNEL */ diff --git a/module/os/freebsd/zfs/zio_crypt.c b/module/os/freebsd/zfs/zio_crypt.c index 0410ddd65a5c..c5e745f7d196 100644 --- a/module/os/freebsd/zfs/zio_crypt.c +++ b/module/os/freebsd/zfs/zio_crypt.c @@ -1,1821 +1,1820 @@ /* * CDDL HEADER START * * This file and its contents are supplied under the terms of the * Common Development and Distribution License ("CDDL"), version 1.0. * You may only use this file in accordance with the terms of version * 1.0 of the CDDL. * * A full copy of the text of the CDDL should have accompanied this * source. A copy of the CDDL is also available via the Internet at * http://www.illumos.org/license/CDDL. * * CDDL HEADER END */ /* * Copyright (c) 2017, Datto, Inc. All rights reserved. */ #include #include #include #include #include #include #include #include #include /* * This file is responsible for handling all of the details of generating * encryption parameters and performing encryption and authentication. * * BLOCK ENCRYPTION PARAMETERS: * Encryption /Authentication Algorithm Suite (crypt): * The encryption algorithm, mode, and key length we are going to use. We * currently support AES in either GCM or CCM modes with 128, 192, and 256 bit * keys. All authentication is currently done with SHA512-HMAC. * * Plaintext: * The unencrypted data that we want to encrypt. * * Initialization Vector (IV): * An initialization vector for the encryption algorithms. This is used to * "tweak" the encryption algorithms so that two blocks of the same data are * encrypted into different ciphertext outputs, thus obfuscating block patterns. * The supported encryption modes (AES-GCM and AES-CCM) require that an IV is * never reused with the same encryption key. This value is stored unencrypted * and must simply be provided to the decryption function. We use a 96 bit IV * (as recommended by NIST) for all block encryption. For non-dedup blocks we * derive the IV randomly. The first 64 bits of the IV are stored in the second * word of DVA[2] and the remaining 32 bits are stored in the upper 32 bits of * blk_fill. This is safe because encrypted blocks can't use the upper 32 bits * of blk_fill. We only encrypt level 0 blocks, which normally have a fill count * of 1. The only exception is for DMU_OT_DNODE objects, where the fill count of * level 0 blocks is the number of allocated dnodes in that block. The on-disk * format supports at most 2^15 slots per L0 dnode block, because the maximum * block size is 16MB (2^24). In either case, for level 0 blocks this number * will still be smaller than UINT32_MAX so it is safe to store the IV in the * top 32 bits of blk_fill, while leaving the bottom 32 bits of the fill count * for the dnode code. * * Master key: * This is the most important secret data of an encrypted dataset. It is used * along with the salt to generate that actual encryption keys via HKDF. We * do not use the master key to directly encrypt any data because there are * theoretical limits on how much data can actually be safely encrypted with * any encryption mode. The master key is stored encrypted on disk with the * user's wrapping key. Its length is determined by the encryption algorithm. * For details on how this is stored see the block comment in dsl_crypt.c * * Salt: * Used as an input to the HKDF function, along with the master key. We use a * 64 bit salt, stored unencrypted in the first word of DVA[2]. Any given salt * can be used for encrypting many blocks, so we cache the current salt and the * associated derived key in zio_crypt_t so we do not need to derive it again * needlessly. * * Encryption Key: * A secret binary key, generated from an HKDF function used to encrypt and * decrypt data. * * Message Authentication Code (MAC) * The MAC is an output of authenticated encryption modes such as AES-GCM and * AES-CCM. Its purpose is to ensure that an attacker cannot modify encrypted * data on disk and return garbage to the application. Effectively, it is a * checksum that can not be reproduced by an attacker. We store the MAC in the * second 128 bits of blk_cksum, leaving the first 128 bits for a truncated * regular checksum of the ciphertext which can be used for scrubbing. * * OBJECT AUTHENTICATION: * Some object types, such as DMU_OT_MASTER_NODE cannot be encrypted because * they contain some info that always needs to be readable. To prevent this * data from being altered, we authenticate this data using SHA512-HMAC. This * will produce a MAC (similar to the one produced via encryption) which can * be used to verify the object was not modified. HMACs do not require key * rotation or IVs, so we can keep up to the full 3 copies of authenticated * data. * * ZIL ENCRYPTION: * ZIL blocks have their bp written to disk ahead of the associated data, so we * cannot store the MAC there as we normally do. For these blocks the MAC is * stored in the embedded checksum within the zil_chain_t header. The salt and * IV are generated for the block on bp allocation instead of at encryption * time. In addition, ZIL blocks have some pieces that must be left in plaintext * for claiming even though all of the sensitive user data still needs to be * encrypted. The function zio_crypt_init_uios_zil() handles parsing which * pieces of the block need to be encrypted. All data that is not encrypted is * authenticated using the AAD mechanisms that the supported encryption modes * provide for. In order to preserve the semantics of the ZIL for encrypted * datasets, the ZIL is not protected at the objset level as described below. * * DNODE ENCRYPTION: * Similarly to ZIL blocks, the core part of each dnode_phys_t needs to be left * in plaintext for scrubbing and claiming, but the bonus buffers might contain * sensitive user data. The function zio_crypt_init_uios_dnode() handles parsing * which pieces of the block need to be encrypted. For more details about * dnode authentication and encryption, see zio_crypt_init_uios_dnode(). * * OBJECT SET AUTHENTICATION: * Up to this point, everything we have encrypted and authenticated has been * at level 0 (or -2 for the ZIL). If we did not do any further work the * on-disk format would be susceptible to attacks that deleted or rearranged * the order of level 0 blocks. Ideally, the cleanest solution would be to * maintain a tree of authentication MACs going up the bp tree. However, this * presents a problem for raw sends. Send files do not send information about * indirect blocks so there would be no convenient way to transfer the MACs and * they cannot be recalculated on the receive side without the master key which * would defeat one of the purposes of raw sends in the first place. Instead, * for the indirect levels of the bp tree, we use a regular SHA512 of the MACs * from the level below. We also include some portable fields from blk_prop such * as the lsize and compression algorithm to prevent the data from being * misinterpreted. * * At the objset level, we maintain 2 separate 256 bit MACs in the * objset_phys_t. The first one is "portable" and is the logical root of the * MAC tree maintained in the metadnode's bps. The second, is "local" and is * used as the root MAC for the user accounting objects, which are also not * transferred via "zfs send". The portable MAC is sent in the DRR_BEGIN payload * of the send file. The useraccounting code ensures that the useraccounting * info is not present upon a receive, so the local MAC can simply be cleared * out at that time. For more info about objset_phys_t authentication, see * zio_crypt_do_objset_hmacs(). * * CONSIDERATIONS FOR DEDUP: * In order for dedup to work, blocks that we want to dedup with one another * need to use the same IV and encryption key, so that they will have the same * ciphertext. Normally, one should never reuse an IV with the same encryption * key or else AES-GCM and AES-CCM can both actually leak the plaintext of both * blocks. In this case, however, since we are using the same plaintext as * well all that we end up with is a duplicate of the original ciphertext we * already had. As a result, an attacker with read access to the raw disk will * be able to tell which blocks are the same but this information is given away * by dedup anyway. In order to get the same IVs and encryption keys for * equivalent blocks of data we use an HMAC of the plaintext. We use an HMAC * here so that a reproducible checksum of the plaintext is never available to * the attacker. The HMAC key is kept alongside the master key, encrypted on * disk. The first 64 bits of the HMAC are used in place of the random salt, and * the next 96 bits are used as the IV. As a result of this mechanism, dedup * will only work within a clone family since encrypted dedup requires use of * the same master and HMAC keys. */ /* * After encrypting many blocks with the same key we may start to run up * against the theoretical limits of how much data can securely be encrypted * with a single key using the supported encryption modes. The most obvious * limitation is that our risk of generating 2 equivalent 96 bit IVs increases * the more IVs we generate (which both GCM and CCM modes strictly forbid). * This risk actually grows surprisingly quickly over time according to the * Birthday Problem. With a total IV space of 2^(96 bits), and assuming we have * generated n IVs with a cryptographically secure RNG, the approximate * probability p(n) of a collision is given as: * * p(n) ~= e^(-n*(n-1)/(2*(2^96))) * * [http://www.math.cornell.edu/~mec/2008-2009/TianyiZheng/Birthday.html] * * Assuming that we want to ensure that p(n) never goes over 1 / 1 trillion * we must not write more than 398,065,730 blocks with the same encryption key. * Therefore, we rotate our keys after 400,000,000 blocks have been written by * generating a new random 64 bit salt for our HKDF encryption key generation * function. */ #define ZFS_KEY_MAX_SALT_USES_DEFAULT 400000000 #define ZFS_CURRENT_MAX_SALT_USES \ (MIN(zfs_key_max_salt_uses, ZFS_KEY_MAX_SALT_USES_DEFAULT)) static unsigned long zfs_key_max_salt_uses = ZFS_KEY_MAX_SALT_USES_DEFAULT; typedef struct blkptr_auth_buf { uint64_t bab_prop; /* blk_prop - portable mask */ uint8_t bab_mac[ZIO_DATA_MAC_LEN]; /* MAC from blk_cksum */ uint64_t bab_pad; /* reserved for future use */ } blkptr_auth_buf_t; const zio_crypt_info_t zio_crypt_table[ZIO_CRYPT_FUNCTIONS] = { {"", ZC_TYPE_NONE, 0, "inherit"}, {"", ZC_TYPE_NONE, 0, "on"}, {"", ZC_TYPE_NONE, 0, "off"}, {SUN_CKM_AES_CCM, ZC_TYPE_CCM, 16, "aes-128-ccm"}, {SUN_CKM_AES_CCM, ZC_TYPE_CCM, 24, "aes-192-ccm"}, {SUN_CKM_AES_CCM, ZC_TYPE_CCM, 32, "aes-256-ccm"}, {SUN_CKM_AES_GCM, ZC_TYPE_GCM, 16, "aes-128-gcm"}, {SUN_CKM_AES_GCM, ZC_TYPE_GCM, 24, "aes-192-gcm"}, {SUN_CKM_AES_GCM, ZC_TYPE_GCM, 32, "aes-256-gcm"} }; static void zio_crypt_key_destroy_early(zio_crypt_key_t *key) { rw_destroy(&key->zk_salt_lock); /* free crypto templates */ memset(&key->zk_session, 0, sizeof (key->zk_session)); /* zero out sensitive data */ memset(key, 0, sizeof (zio_crypt_key_t)); } void zio_crypt_key_destroy(zio_crypt_key_t *key) { freebsd_crypt_freesession(&key->zk_session); zio_crypt_key_destroy_early(key); } int zio_crypt_key_init(uint64_t crypt, zio_crypt_key_t *key) { int ret; crypto_mechanism_t mech __unused; uint_t keydata_len; const zio_crypt_info_t *ci = NULL; ASSERT3P(key, !=, NULL); ASSERT3U(crypt, <, ZIO_CRYPT_FUNCTIONS); ci = &zio_crypt_table[crypt]; if (ci->ci_crypt_type != ZC_TYPE_GCM && ci->ci_crypt_type != ZC_TYPE_CCM) return (ENOTSUP); keydata_len = zio_crypt_table[crypt].ci_keylen; memset(key, 0, sizeof (zio_crypt_key_t)); rw_init(&key->zk_salt_lock, NULL, RW_DEFAULT, NULL); /* fill keydata buffers and salt with random data */ ret = random_get_bytes((uint8_t *)&key->zk_guid, sizeof (uint64_t)); if (ret != 0) goto error; ret = random_get_bytes(key->zk_master_keydata, keydata_len); if (ret != 0) goto error; ret = random_get_bytes(key->zk_hmac_keydata, SHA512_HMAC_KEYLEN); if (ret != 0) goto error; ret = random_get_bytes(key->zk_salt, ZIO_DATA_SALT_LEN); if (ret != 0) goto error; /* derive the current key from the master key */ ret = hkdf_sha512(key->zk_master_keydata, keydata_len, NULL, 0, key->zk_salt, ZIO_DATA_SALT_LEN, key->zk_current_keydata, keydata_len); if (ret != 0) goto error; /* initialize keys for the ICP */ key->zk_current_key.ck_data = key->zk_current_keydata; key->zk_current_key.ck_length = CRYPTO_BYTES2BITS(keydata_len); key->zk_hmac_key.ck_data = &key->zk_hmac_key; key->zk_hmac_key.ck_length = CRYPTO_BYTES2BITS(SHA512_HMAC_KEYLEN); ci = &zio_crypt_table[crypt]; if (ci->ci_crypt_type != ZC_TYPE_GCM && ci->ci_crypt_type != ZC_TYPE_CCM) return (ENOTSUP); ret = freebsd_crypt_newsession(&key->zk_session, ci, &key->zk_current_key); if (ret) goto error; key->zk_crypt = crypt; key->zk_version = ZIO_CRYPT_KEY_CURRENT_VERSION; key->zk_salt_count = 0; return (0); error: zio_crypt_key_destroy_early(key); return (ret); } static int zio_crypt_key_change_salt(zio_crypt_key_t *key) { int ret = 0; uint8_t salt[ZIO_DATA_SALT_LEN]; crypto_mechanism_t mech __unused; uint_t keydata_len = zio_crypt_table[key->zk_crypt].ci_keylen; /* generate a new salt */ ret = random_get_bytes(salt, ZIO_DATA_SALT_LEN); if (ret != 0) goto error; rw_enter(&key->zk_salt_lock, RW_WRITER); /* someone beat us to the salt rotation, just unlock and return */ if (key->zk_salt_count < ZFS_CURRENT_MAX_SALT_USES) goto out_unlock; /* derive the current key from the master key and the new salt */ ret = hkdf_sha512(key->zk_master_keydata, keydata_len, NULL, 0, salt, ZIO_DATA_SALT_LEN, key->zk_current_keydata, keydata_len); if (ret != 0) goto out_unlock; /* assign the salt and reset the usage count */ memcpy(key->zk_salt, salt, ZIO_DATA_SALT_LEN); key->zk_salt_count = 0; freebsd_crypt_freesession(&key->zk_session); ret = freebsd_crypt_newsession(&key->zk_session, &zio_crypt_table[key->zk_crypt], &key->zk_current_key); if (ret != 0) goto out_unlock; rw_exit(&key->zk_salt_lock); return (0); out_unlock: rw_exit(&key->zk_salt_lock); error: return (ret); } /* See comment above zfs_key_max_salt_uses definition for details */ int zio_crypt_key_get_salt(zio_crypt_key_t *key, uint8_t *salt) { int ret; boolean_t salt_change; rw_enter(&key->zk_salt_lock, RW_READER); memcpy(salt, key->zk_salt, ZIO_DATA_SALT_LEN); salt_change = (atomic_inc_64_nv(&key->zk_salt_count) >= ZFS_CURRENT_MAX_SALT_USES); rw_exit(&key->zk_salt_lock); if (salt_change) { ret = zio_crypt_key_change_salt(key); if (ret != 0) goto error; } return (0); error: return (ret); } void *failed_decrypt_buf; int failed_decrypt_size; /* * This function handles all encryption and decryption in zfs. When * encrypting it expects puio to reference the plaintext and cuio to * reference the ciphertext. cuio must have enough space for the * ciphertext + room for a MAC. datalen should be the length of the * plaintext / ciphertext alone. */ /* * The implementation for FreeBSD's OpenCrypto. * * The big difference between ICP and FOC is that FOC uses a single * buffer for input and output. This means that (for AES-GCM, the * only one supported right now) the source must be copied into the * destination, and the destination must have the AAD, and the tag/MAC, * already associated with it. (Both implementations can use a uio.) * * Since the auth data is part of the iovec array, all we need to know * is the length: 0 means there's no AAD. * */ static int zio_do_crypt_uio_opencrypto(boolean_t encrypt, freebsd_crypt_session_t *sess, uint64_t crypt, crypto_key_t *key, uint8_t *ivbuf, uint_t datalen, zfs_uio_t *uio, uint_t auth_len) { const zio_crypt_info_t *ci = &zio_crypt_table[crypt]; if (ci->ci_crypt_type != ZC_TYPE_GCM && ci->ci_crypt_type != ZC_TYPE_CCM) return (ENOTSUP); int ret = freebsd_crypt_uio(encrypt, sess, ci, uio, key, ivbuf, datalen, auth_len); if (ret != 0) { #ifdef FCRYPTO_DEBUG printf("%s(%d): Returning error %s\n", __FUNCTION__, __LINE__, encrypt ? "EIO" : "ECKSUM"); #endif ret = SET_ERROR(encrypt ? EIO : ECKSUM); } return (ret); } int zio_crypt_key_wrap(crypto_key_t *cwkey, zio_crypt_key_t *key, uint8_t *iv, uint8_t *mac, uint8_t *keydata_out, uint8_t *hmac_keydata_out) { int ret; uint64_t aad[3]; /* * With OpenCrypto in FreeBSD, the same buffer is used for * input and output. Also, the AAD (for AES-GMC at least) * needs to logically go in front. */ zfs_uio_t cuio; struct uio cuio_s; iovec_t iovecs[4]; uint64_t crypt = key->zk_crypt; uint_t enc_len, keydata_len, aad_len; ASSERT3U(crypt, <, ZIO_CRYPT_FUNCTIONS); zfs_uio_init(&cuio, &cuio_s); keydata_len = zio_crypt_table[crypt].ci_keylen; /* generate iv for wrapping the master and hmac key */ ret = random_get_pseudo_bytes(iv, WRAPPING_IV_LEN); if (ret != 0) goto error; /* * Since we only support one buffer, we need to copy * the plain text (source) to the cipher buffer (dest). * We set iovecs[0] -- the authentication data -- below. */ memcpy(keydata_out, key->zk_master_keydata, keydata_len); memcpy(hmac_keydata_out, key->zk_hmac_keydata, SHA512_HMAC_KEYLEN); iovecs[1].iov_base = keydata_out; iovecs[1].iov_len = keydata_len; iovecs[2].iov_base = hmac_keydata_out; iovecs[2].iov_len = SHA512_HMAC_KEYLEN; iovecs[3].iov_base = mac; iovecs[3].iov_len = WRAPPING_MAC_LEN; /* * Although we don't support writing to the old format, we do * support rewrapping the key so that the user can move and * quarantine datasets on the old format. */ if (key->zk_version == 0) { aad_len = sizeof (uint64_t); aad[0] = LE_64(key->zk_guid); } else { ASSERT3U(key->zk_version, ==, ZIO_CRYPT_KEY_CURRENT_VERSION); aad_len = sizeof (uint64_t) * 3; aad[0] = LE_64(key->zk_guid); aad[1] = LE_64(crypt); aad[2] = LE_64(key->zk_version); } iovecs[0].iov_base = aad; iovecs[0].iov_len = aad_len; enc_len = zio_crypt_table[crypt].ci_keylen + SHA512_HMAC_KEYLEN; GET_UIO_STRUCT(&cuio)->uio_iov = iovecs; zfs_uio_iovcnt(&cuio) = 4; zfs_uio_segflg(&cuio) = UIO_SYSSPACE; /* encrypt the keys and store the resulting ciphertext and mac */ ret = zio_do_crypt_uio_opencrypto(B_TRUE, NULL, crypt, cwkey, iv, enc_len, &cuio, aad_len); if (ret != 0) goto error; return (0); error: return (ret); } int zio_crypt_key_unwrap(crypto_key_t *cwkey, uint64_t crypt, uint64_t version, uint64_t guid, uint8_t *keydata, uint8_t *hmac_keydata, uint8_t *iv, uint8_t *mac, zio_crypt_key_t *key) { int ret; uint64_t aad[3]; /* * With OpenCrypto in FreeBSD, the same buffer is used for * input and output. Also, the AAD (for AES-GMC at least) * needs to logically go in front. */ zfs_uio_t cuio; struct uio cuio_s; iovec_t iovecs[4]; void *src, *dst; uint_t enc_len, keydata_len, aad_len; ASSERT3U(crypt, <, ZIO_CRYPT_FUNCTIONS); keydata_len = zio_crypt_table[crypt].ci_keylen; rw_init(&key->zk_salt_lock, NULL, RW_DEFAULT, NULL); zfs_uio_init(&cuio, &cuio_s); /* * Since we only support one buffer, we need to copy * the encrypted buffer (source) to the plain buffer * (dest). We set iovecs[0] -- the authentication data -- * below. */ dst = key->zk_master_keydata; src = keydata; memcpy(dst, src, keydata_len); dst = key->zk_hmac_keydata; src = hmac_keydata; memcpy(dst, src, SHA512_HMAC_KEYLEN); iovecs[1].iov_base = key->zk_master_keydata; iovecs[1].iov_len = keydata_len; iovecs[2].iov_base = key->zk_hmac_keydata; iovecs[2].iov_len = SHA512_HMAC_KEYLEN; iovecs[3].iov_base = mac; iovecs[3].iov_len = WRAPPING_MAC_LEN; if (version == 0) { aad_len = sizeof (uint64_t); aad[0] = LE_64(guid); } else { ASSERT3U(version, ==, ZIO_CRYPT_KEY_CURRENT_VERSION); aad_len = sizeof (uint64_t) * 3; aad[0] = LE_64(guid); aad[1] = LE_64(crypt); aad[2] = LE_64(version); } enc_len = keydata_len + SHA512_HMAC_KEYLEN; iovecs[0].iov_base = aad; iovecs[0].iov_len = aad_len; GET_UIO_STRUCT(&cuio)->uio_iov = iovecs; zfs_uio_iovcnt(&cuio) = 4; zfs_uio_segflg(&cuio) = UIO_SYSSPACE; /* decrypt the keys and store the result in the output buffers */ ret = zio_do_crypt_uio_opencrypto(B_FALSE, NULL, crypt, cwkey, iv, enc_len, &cuio, aad_len); if (ret != 0) goto error; /* generate a fresh salt */ ret = random_get_bytes(key->zk_salt, ZIO_DATA_SALT_LEN); if (ret != 0) goto error; /* derive the current key from the master key */ ret = hkdf_sha512(key->zk_master_keydata, keydata_len, NULL, 0, key->zk_salt, ZIO_DATA_SALT_LEN, key->zk_current_keydata, keydata_len); if (ret != 0) goto error; /* initialize keys for ICP */ key->zk_current_key.ck_data = key->zk_current_keydata; key->zk_current_key.ck_length = CRYPTO_BYTES2BITS(keydata_len); key->zk_hmac_key.ck_data = key->zk_hmac_keydata; key->zk_hmac_key.ck_length = CRYPTO_BYTES2BITS(SHA512_HMAC_KEYLEN); ret = freebsd_crypt_newsession(&key->zk_session, &zio_crypt_table[crypt], &key->zk_current_key); if (ret != 0) goto error; key->zk_crypt = crypt; key->zk_version = version; key->zk_guid = guid; key->zk_salt_count = 0; return (0); error: zio_crypt_key_destroy_early(key); return (ret); } int zio_crypt_generate_iv(uint8_t *ivbuf) { int ret; /* randomly generate the IV */ ret = random_get_pseudo_bytes(ivbuf, ZIO_DATA_IV_LEN); if (ret != 0) goto error; return (0); error: memset(ivbuf, 0, ZIO_DATA_IV_LEN); return (ret); } int zio_crypt_do_hmac(zio_crypt_key_t *key, uint8_t *data, uint_t datalen, uint8_t *digestbuf, uint_t digestlen) { uint8_t raw_digestbuf[SHA512_DIGEST_LENGTH]; ASSERT3U(digestlen, <=, SHA512_DIGEST_LENGTH); crypto_mac(&key->zk_hmac_key, data, datalen, raw_digestbuf, SHA512_DIGEST_LENGTH); memcpy(digestbuf, raw_digestbuf, digestlen); return (0); } int zio_crypt_generate_iv_salt_dedup(zio_crypt_key_t *key, uint8_t *data, uint_t datalen, uint8_t *ivbuf, uint8_t *salt) { int ret; uint8_t digestbuf[SHA512_DIGEST_LENGTH]; ret = zio_crypt_do_hmac(key, data, datalen, digestbuf, SHA512_DIGEST_LENGTH); if (ret != 0) return (ret); memcpy(salt, digestbuf, ZIO_DATA_SALT_LEN); memcpy(ivbuf, digestbuf + ZIO_DATA_SALT_LEN, ZIO_DATA_IV_LEN); return (0); } /* * The following functions are used to encode and decode encryption parameters * into blkptr_t and zil_header_t. The ICP wants to use these parameters as * byte strings, which normally means that these strings would not need to deal * with byteswapping at all. However, both blkptr_t and zil_header_t may be * byteswapped by lower layers and so we must "undo" that byteswap here upon * decoding and encoding in a non-native byteorder. These functions require * that the byteorder bit is correct before being called. */ void zio_crypt_encode_params_bp(blkptr_t *bp, uint8_t *salt, uint8_t *iv) { uint64_t val64; uint32_t val32; ASSERT(BP_IS_ENCRYPTED(bp)); if (!BP_SHOULD_BYTESWAP(bp)) { memcpy(&bp->blk_dva[2].dva_word[0], salt, sizeof (uint64_t)); memcpy(&bp->blk_dva[2].dva_word[1], iv, sizeof (uint64_t)); memcpy(&val32, iv + sizeof (uint64_t), sizeof (uint32_t)); BP_SET_IV2(bp, val32); } else { memcpy(&val64, salt, sizeof (uint64_t)); bp->blk_dva[2].dva_word[0] = BSWAP_64(val64); memcpy(&val64, iv, sizeof (uint64_t)); bp->blk_dva[2].dva_word[1] = BSWAP_64(val64); memcpy(&val32, iv + sizeof (uint64_t), sizeof (uint32_t)); BP_SET_IV2(bp, BSWAP_32(val32)); } } void zio_crypt_decode_params_bp(const blkptr_t *bp, uint8_t *salt, uint8_t *iv) { uint64_t val64; uint32_t val32; ASSERT(BP_IS_PROTECTED(bp)); /* for convenience, so callers don't need to check */ if (BP_IS_AUTHENTICATED(bp)) { memset(salt, 0, ZIO_DATA_SALT_LEN); memset(iv, 0, ZIO_DATA_IV_LEN); return; } if (!BP_SHOULD_BYTESWAP(bp)) { memcpy(salt, &bp->blk_dva[2].dva_word[0], sizeof (uint64_t)); memcpy(iv, &bp->blk_dva[2].dva_word[1], sizeof (uint64_t)); val32 = (uint32_t)BP_GET_IV2(bp); memcpy(iv + sizeof (uint64_t), &val32, sizeof (uint32_t)); } else { val64 = BSWAP_64(bp->blk_dva[2].dva_word[0]); memcpy(salt, &val64, sizeof (uint64_t)); val64 = BSWAP_64(bp->blk_dva[2].dva_word[1]); memcpy(iv, &val64, sizeof (uint64_t)); val32 = BSWAP_32((uint32_t)BP_GET_IV2(bp)); memcpy(iv + sizeof (uint64_t), &val32, sizeof (uint32_t)); } } void zio_crypt_encode_mac_bp(blkptr_t *bp, uint8_t *mac) { uint64_t val64; ASSERT(BP_USES_CRYPT(bp)); ASSERT3U(BP_GET_TYPE(bp), !=, DMU_OT_OBJSET); if (!BP_SHOULD_BYTESWAP(bp)) { memcpy(&bp->blk_cksum.zc_word[2], mac, sizeof (uint64_t)); memcpy(&bp->blk_cksum.zc_word[3], mac + sizeof (uint64_t), sizeof (uint64_t)); } else { memcpy(&val64, mac, sizeof (uint64_t)); bp->blk_cksum.zc_word[2] = BSWAP_64(val64); memcpy(&val64, mac + sizeof (uint64_t), sizeof (uint64_t)); bp->blk_cksum.zc_word[3] = BSWAP_64(val64); } } void zio_crypt_decode_mac_bp(const blkptr_t *bp, uint8_t *mac) { uint64_t val64; ASSERT(BP_USES_CRYPT(bp) || BP_IS_HOLE(bp)); /* for convenience, so callers don't need to check */ if (BP_GET_TYPE(bp) == DMU_OT_OBJSET) { memset(mac, 0, ZIO_DATA_MAC_LEN); return; } if (!BP_SHOULD_BYTESWAP(bp)) { memcpy(mac, &bp->blk_cksum.zc_word[2], sizeof (uint64_t)); memcpy(mac + sizeof (uint64_t), &bp->blk_cksum.zc_word[3], sizeof (uint64_t)); } else { val64 = BSWAP_64(bp->blk_cksum.zc_word[2]); memcpy(mac, &val64, sizeof (uint64_t)); val64 = BSWAP_64(bp->blk_cksum.zc_word[3]); memcpy(mac + sizeof (uint64_t), &val64, sizeof (uint64_t)); } } void zio_crypt_encode_mac_zil(void *data, uint8_t *mac) { zil_chain_t *zilc = data; memcpy(&zilc->zc_eck.zec_cksum.zc_word[2], mac, sizeof (uint64_t)); memcpy(&zilc->zc_eck.zec_cksum.zc_word[3], mac + sizeof (uint64_t), sizeof (uint64_t)); } void zio_crypt_decode_mac_zil(const void *data, uint8_t *mac) { /* * The ZIL MAC is embedded in the block it protects, which will * not have been byteswapped by the time this function has been called. * As a result, we don't need to worry about byteswapping the MAC. */ const zil_chain_t *zilc = data; memcpy(mac, &zilc->zc_eck.zec_cksum.zc_word[2], sizeof (uint64_t)); memcpy(mac + sizeof (uint64_t), &zilc->zc_eck.zec_cksum.zc_word[3], sizeof (uint64_t)); } /* * This routine takes a block of dnodes (src_abd) and copies only the bonus * buffers to the same offsets in the dst buffer. datalen should be the size * of both the src_abd and the dst buffer (not just the length of the bonus * buffers). */ void zio_crypt_copy_dnode_bonus(abd_t *src_abd, uint8_t *dst, uint_t datalen) { uint_t i, max_dnp = datalen >> DNODE_SHIFT; uint8_t *src; dnode_phys_t *dnp, *sdnp, *ddnp; src = abd_borrow_buf_copy(src_abd, datalen); sdnp = (dnode_phys_t *)src; ddnp = (dnode_phys_t *)dst; for (i = 0; i < max_dnp; i += sdnp[i].dn_extra_slots + 1) { dnp = &sdnp[i]; if (dnp->dn_type != DMU_OT_NONE && DMU_OT_IS_ENCRYPTED(dnp->dn_bonustype) && dnp->dn_bonuslen != 0) { memcpy(DN_BONUS(&ddnp[i]), DN_BONUS(dnp), DN_MAX_BONUS_LEN(dnp)); } } abd_return_buf(src_abd, src, datalen); } /* * This function decides what fields from blk_prop are included in * the on-disk various MAC algorithms. */ static void zio_crypt_bp_zero_nonportable_blkprop(blkptr_t *bp, uint64_t version) { int avoidlint = SPA_MINBLOCKSIZE; /* * Version 0 did not properly zero out all non-portable fields * as it should have done. We maintain this code so that we can * do read-only imports of pools on this version. */ if (version == 0) { BP_SET_DEDUP(bp, 0); BP_SET_CHECKSUM(bp, 0); BP_SET_PSIZE(bp, avoidlint); return; } ASSERT3U(version, ==, ZIO_CRYPT_KEY_CURRENT_VERSION); /* * The hole_birth feature might set these fields even if this bp * is a hole. We zero them out here to guarantee that raw sends * will function with or without the feature. */ if (BP_IS_HOLE(bp)) { bp->blk_prop = 0ULL; return; } /* * At L0 we want to verify these fields to ensure that data blocks * can not be reinterpreted. For instance, we do not want an attacker * to trick us into returning raw lz4 compressed data to the user * by modifying the compression bits. At higher levels, we cannot * enforce this policy since raw sends do not convey any information * about indirect blocks, so these values might be different on the * receive side. Fortunately, this does not open any new attack * vectors, since any alterations that can be made to a higher level * bp must still verify the correct order of the layer below it. */ if (BP_GET_LEVEL(bp) != 0) { BP_SET_BYTEORDER(bp, 0); BP_SET_COMPRESS(bp, 0); /* * psize cannot be set to zero or it will trigger * asserts, but the value doesn't really matter as * long as it is constant. */ BP_SET_PSIZE(bp, avoidlint); } BP_SET_DEDUP(bp, 0); BP_SET_CHECKSUM(bp, 0); } static void zio_crypt_bp_auth_init(uint64_t version, boolean_t should_bswap, blkptr_t *bp, blkptr_auth_buf_t *bab, uint_t *bab_len) { blkptr_t tmpbp = *bp; if (should_bswap) byteswap_uint64_array(&tmpbp, sizeof (blkptr_t)); ASSERT(BP_USES_CRYPT(&tmpbp) || BP_IS_HOLE(&tmpbp)); ASSERT0(BP_IS_EMBEDDED(&tmpbp)); zio_crypt_decode_mac_bp(&tmpbp, bab->bab_mac); /* * We always MAC blk_prop in LE to ensure portability. This * must be done after decoding the mac, since the endianness * will get zero'd out here. */ zio_crypt_bp_zero_nonportable_blkprop(&tmpbp, version); bab->bab_prop = LE_64(tmpbp.blk_prop); bab->bab_pad = 0ULL; /* version 0 did not include the padding */ *bab_len = sizeof (blkptr_auth_buf_t); if (version == 0) *bab_len -= sizeof (uint64_t); } static int zio_crypt_bp_do_hmac_updates(crypto_context_t ctx, uint64_t version, boolean_t should_bswap, blkptr_t *bp) { uint_t bab_len; blkptr_auth_buf_t bab; zio_crypt_bp_auth_init(version, should_bswap, bp, &bab, &bab_len); crypto_mac_update(ctx, &bab, bab_len); return (0); } static void zio_crypt_bp_do_indrect_checksum_updates(SHA2_CTX *ctx, uint64_t version, boolean_t should_bswap, blkptr_t *bp) { uint_t bab_len; blkptr_auth_buf_t bab; zio_crypt_bp_auth_init(version, should_bswap, bp, &bab, &bab_len); SHA2Update(ctx, &bab, bab_len); } static void zio_crypt_bp_do_aad_updates(uint8_t **aadp, uint_t *aad_len, uint64_t version, boolean_t should_bswap, blkptr_t *bp) { uint_t bab_len; blkptr_auth_buf_t bab; zio_crypt_bp_auth_init(version, should_bswap, bp, &bab, &bab_len); memcpy(*aadp, &bab, bab_len); *aadp += bab_len; *aad_len += bab_len; } static int zio_crypt_do_dnode_hmac_updates(crypto_context_t ctx, uint64_t version, boolean_t should_bswap, dnode_phys_t *dnp) { int ret, i; dnode_phys_t *adnp; boolean_t le_bswap = (should_bswap == ZFS_HOST_BYTEORDER); uint8_t tmp_dncore[offsetof(dnode_phys_t, dn_blkptr)]; /* authenticate the core dnode (masking out non-portable bits) */ memcpy(tmp_dncore, dnp, sizeof (tmp_dncore)); adnp = (dnode_phys_t *)tmp_dncore; if (le_bswap) { adnp->dn_datablkszsec = BSWAP_16(adnp->dn_datablkszsec); adnp->dn_bonuslen = BSWAP_16(adnp->dn_bonuslen); adnp->dn_maxblkid = BSWAP_64(adnp->dn_maxblkid); adnp->dn_used = BSWAP_64(adnp->dn_used); } adnp->dn_flags &= DNODE_CRYPT_PORTABLE_FLAGS_MASK; adnp->dn_used = 0; crypto_mac_update(ctx, adnp, sizeof (tmp_dncore)); for (i = 0; i < dnp->dn_nblkptr; i++) { ret = zio_crypt_bp_do_hmac_updates(ctx, version, should_bswap, &dnp->dn_blkptr[i]); if (ret != 0) goto error; } if (dnp->dn_flags & DNODE_FLAG_SPILL_BLKPTR) { ret = zio_crypt_bp_do_hmac_updates(ctx, version, should_bswap, DN_SPILL_BLKPTR(dnp)); if (ret != 0) goto error; } return (0); error: return (ret); } /* * objset_phys_t blocks introduce a number of exceptions to the normal * authentication process. objset_phys_t's contain 2 separate HMACS for * protecting the integrity of their data. The portable_mac protects the * metadnode. This MAC can be sent with a raw send and protects against * reordering of data within the metadnode. The local_mac protects the user * accounting objects which are not sent from one system to another. * * In addition, objset blocks are the only blocks that can be modified and * written to disk without the key loaded under certain circumstances. During * zil_claim() we need to be able to update the zil_header_t to complete * claiming log blocks and during raw receives we need to write out the * portable_mac from the send file. Both of these actions are possible * because these fields are not protected by either MAC so neither one will * need to modify the MACs without the key. However, when the modified blocks * are written out they will be byteswapped into the host machine's native * endianness which will modify fields protected by the MAC. As a result, MAC * calculation for objset blocks works slightly differently from other block * types. Where other block types MAC the data in whatever endianness is * written to disk, objset blocks always MAC little endian version of their * values. In the code, should_bswap is the value from BP_SHOULD_BYTESWAP() * and le_bswap indicates whether a byteswap is needed to get this block * into little endian format. */ int zio_crypt_do_objset_hmacs(zio_crypt_key_t *key, void *data, uint_t datalen, boolean_t should_bswap, uint8_t *portable_mac, uint8_t *local_mac) { int ret; struct hmac_ctx hash_ctx; struct hmac_ctx *ctx = &hash_ctx; objset_phys_t *osp = data; uint64_t intval; boolean_t le_bswap = (should_bswap == ZFS_HOST_BYTEORDER); uint8_t raw_portable_mac[SHA512_DIGEST_LENGTH]; uint8_t raw_local_mac[SHA512_DIGEST_LENGTH]; /* calculate the portable MAC from the portable fields and metadnode */ crypto_mac_init(ctx, &key->zk_hmac_key); /* add in the os_type */ intval = (le_bswap) ? osp->os_type : BSWAP_64(osp->os_type); crypto_mac_update(ctx, &intval, sizeof (uint64_t)); /* add in the portable os_flags */ intval = osp->os_flags; if (should_bswap) intval = BSWAP_64(intval); intval &= OBJSET_CRYPT_PORTABLE_FLAGS_MASK; if (!ZFS_HOST_BYTEORDER) intval = BSWAP_64(intval); crypto_mac_update(ctx, &intval, sizeof (uint64_t)); /* add in fields from the metadnode */ ret = zio_crypt_do_dnode_hmac_updates(ctx, key->zk_version, should_bswap, &osp->os_meta_dnode); if (ret) goto error; crypto_mac_final(ctx, raw_portable_mac, SHA512_DIGEST_LENGTH); memcpy(portable_mac, raw_portable_mac, ZIO_OBJSET_MAC_LEN); /* * This is necessary here as we check next whether * OBJSET_FLAG_USERACCOUNTING_COMPLETE is set in order to * decide if the local_mac should be zeroed out. That flag will always * be set by dmu_objset_id_quota_upgrade_cb() and * dmu_objset_userspace_upgrade_cb() if useraccounting has been * completed. */ intval = osp->os_flags; if (should_bswap) intval = BSWAP_64(intval); boolean_t uacct_incomplete = !(intval & OBJSET_FLAG_USERACCOUNTING_COMPLETE); /* * The local MAC protects the user, group and project accounting. * If these objects are not present, the local MAC is zeroed out. */ if (uacct_incomplete || (datalen >= OBJSET_PHYS_SIZE_V3 && osp->os_userused_dnode.dn_type == DMU_OT_NONE && osp->os_groupused_dnode.dn_type == DMU_OT_NONE && osp->os_projectused_dnode.dn_type == DMU_OT_NONE) || (datalen >= OBJSET_PHYS_SIZE_V2 && osp->os_userused_dnode.dn_type == DMU_OT_NONE && osp->os_groupused_dnode.dn_type == DMU_OT_NONE) || (datalen <= OBJSET_PHYS_SIZE_V1)) { memset(local_mac, 0, ZIO_OBJSET_MAC_LEN); return (0); } /* calculate the local MAC from the userused and groupused dnodes */ crypto_mac_init(ctx, &key->zk_hmac_key); /* add in the non-portable os_flags */ intval = osp->os_flags; if (should_bswap) intval = BSWAP_64(intval); intval &= ~OBJSET_CRYPT_PORTABLE_FLAGS_MASK; if (!ZFS_HOST_BYTEORDER) intval = BSWAP_64(intval); crypto_mac_update(ctx, &intval, sizeof (uint64_t)); /* XXX check dnode type ... */ /* add in fields from the user accounting dnodes */ if (osp->os_userused_dnode.dn_type != DMU_OT_NONE) { ret = zio_crypt_do_dnode_hmac_updates(ctx, key->zk_version, should_bswap, &osp->os_userused_dnode); if (ret) goto error; } if (osp->os_groupused_dnode.dn_type != DMU_OT_NONE) { ret = zio_crypt_do_dnode_hmac_updates(ctx, key->zk_version, should_bswap, &osp->os_groupused_dnode); if (ret) goto error; } if (osp->os_projectused_dnode.dn_type != DMU_OT_NONE && datalen >= OBJSET_PHYS_SIZE_V3) { ret = zio_crypt_do_dnode_hmac_updates(ctx, key->zk_version, should_bswap, &osp->os_projectused_dnode); if (ret) goto error; } crypto_mac_final(ctx, raw_local_mac, SHA512_DIGEST_LENGTH); memcpy(local_mac, raw_local_mac, ZIO_OBJSET_MAC_LEN); return (0); error: memset(portable_mac, 0, ZIO_OBJSET_MAC_LEN); memset(local_mac, 0, ZIO_OBJSET_MAC_LEN); return (ret); } static void zio_crypt_destroy_uio(zfs_uio_t *uio) { if (GET_UIO_STRUCT(uio)->uio_iov) kmem_free(GET_UIO_STRUCT(uio)->uio_iov, zfs_uio_iovcnt(uio) * sizeof (iovec_t)); } /* * This function parses an uncompressed indirect block and returns a checksum * of all the portable fields from all of the contained bps. The portable * fields are the MAC and all of the fields from blk_prop except for the dedup, * checksum, and psize bits. For an explanation of the purpose of this, see * the comment block on object set authentication. */ static int zio_crypt_do_indirect_mac_checksum_impl(boolean_t generate, void *buf, uint_t datalen, uint64_t version, boolean_t byteswap, uint8_t *cksum) { blkptr_t *bp; int i, epb = datalen >> SPA_BLKPTRSHIFT; SHA2_CTX ctx; uint8_t digestbuf[SHA512_DIGEST_LENGTH]; /* checksum all of the MACs from the layer below */ SHA2Init(SHA512, &ctx); for (i = 0, bp = buf; i < epb; i++, bp++) { zio_crypt_bp_do_indrect_checksum_updates(&ctx, version, byteswap, bp); } SHA2Final(digestbuf, &ctx); if (generate) { memcpy(cksum, digestbuf, ZIO_DATA_MAC_LEN); return (0); } if (memcmp(digestbuf, cksum, ZIO_DATA_MAC_LEN) != 0) { #ifdef FCRYPTO_DEBUG printf("%s(%d): Setting ECKSUM\n", __FUNCTION__, __LINE__); #endif return (SET_ERROR(ECKSUM)); } return (0); } int zio_crypt_do_indirect_mac_checksum(boolean_t generate, void *buf, uint_t datalen, boolean_t byteswap, uint8_t *cksum) { int ret; /* * Unfortunately, callers of this function will not always have * easy access to the on-disk format version. This info is * normally found in the DSL Crypto Key, but the checksum-of-MACs * is expected to be verifiable even when the key isn't loaded. * Here, instead of doing a ZAP lookup for the version for each * zio, we simply try both existing formats. */ ret = zio_crypt_do_indirect_mac_checksum_impl(generate, buf, datalen, ZIO_CRYPT_KEY_CURRENT_VERSION, byteswap, cksum); if (ret == ECKSUM) { ASSERT(!generate); ret = zio_crypt_do_indirect_mac_checksum_impl(generate, buf, datalen, 0, byteswap, cksum); } return (ret); } int zio_crypt_do_indirect_mac_checksum_abd(boolean_t generate, abd_t *abd, uint_t datalen, boolean_t byteswap, uint8_t *cksum) { int ret; void *buf; buf = abd_borrow_buf_copy(abd, datalen); ret = zio_crypt_do_indirect_mac_checksum(generate, buf, datalen, byteswap, cksum); abd_return_buf(abd, buf, datalen); return (ret); } /* * Special case handling routine for encrypting / decrypting ZIL blocks. * We do not check for the older ZIL chain because the encryption feature * was not available before the newer ZIL chain was introduced. The goal * here is to encrypt everything except the blkptr_t of a lr_write_t and * the zil_chain_t header. Everything that is not encrypted is authenticated. */ /* * The OpenCrypto used in FreeBSD does not use separate source and * destination buffers; instead, the same buffer is used. Further, to * accommodate some of the drivers, the authbuf needs to be logically before * the data. This means that we need to copy the source to the destination, * and set up an extra iovec_t at the beginning to handle the authbuf. * It also means we'll only return one zfs_uio_t. */ static int zio_crypt_init_uios_zil(boolean_t encrypt, uint8_t *plainbuf, uint8_t *cipherbuf, uint_t datalen, boolean_t byteswap, zfs_uio_t *puio, zfs_uio_t *out_uio, uint_t *enc_len, uint8_t **authbuf, uint_t *auth_len, boolean_t *no_crypt) { (void) puio; uint8_t *aadbuf = zio_buf_alloc(datalen); uint8_t *src, *dst, *slrp, *dlrp, *blkend, *aadp; iovec_t *dst_iovecs; zil_chain_t *zilc; lr_t *lr; uint64_t txtype, lr_len; uint_t crypt_len, nr_iovecs, vec; uint_t aad_len = 0, total_len = 0; if (encrypt) { src = plainbuf; dst = cipherbuf; } else { src = cipherbuf; dst = plainbuf; } memcpy(dst, src, datalen); /* Find the start and end record of the log block. */ zilc = (zil_chain_t *)src; slrp = src + sizeof (zil_chain_t); aadp = aadbuf; blkend = src + ((byteswap) ? BSWAP_64(zilc->zc_nused) : zilc->zc_nused); /* * Calculate the number of encrypted iovecs we will need. */ /* We need at least two iovecs -- one for the AAD, one for the MAC. */ nr_iovecs = 2; for (; slrp < blkend; slrp += lr_len) { lr = (lr_t *)slrp; if (byteswap) { txtype = BSWAP_64(lr->lrc_txtype); lr_len = BSWAP_64(lr->lrc_reclen); } else { txtype = lr->lrc_txtype; lr_len = lr->lrc_reclen; } nr_iovecs++; if (txtype == TX_WRITE && lr_len != sizeof (lr_write_t)) nr_iovecs++; } dst_iovecs = kmem_alloc(nr_iovecs * sizeof (iovec_t), KM_SLEEP); /* * Copy the plain zil header over and authenticate everything except * the checksum that will store our MAC. If we are writing the data * the embedded checksum will not have been calculated yet, so we don't * authenticate that. */ memcpy(aadp, src, sizeof (zil_chain_t) - sizeof (zio_eck_t)); aadp += sizeof (zil_chain_t) - sizeof (zio_eck_t); aad_len += sizeof (zil_chain_t) - sizeof (zio_eck_t); slrp = src + sizeof (zil_chain_t); dlrp = dst + sizeof (zil_chain_t); /* * Loop over records again, filling in iovecs. */ /* The first iovec will contain the authbuf. */ vec = 1; for (; slrp < blkend; slrp += lr_len, dlrp += lr_len) { lr = (lr_t *)slrp; if (!byteswap) { txtype = lr->lrc_txtype; lr_len = lr->lrc_reclen; } else { txtype = BSWAP_64(lr->lrc_txtype); lr_len = BSWAP_64(lr->lrc_reclen); } /* copy the common lr_t */ memcpy(dlrp, slrp, sizeof (lr_t)); memcpy(aadp, slrp, sizeof (lr_t)); aadp += sizeof (lr_t); aad_len += sizeof (lr_t); /* * If this is a TX_WRITE record we want to encrypt everything * except the bp if exists. If the bp does exist we want to * authenticate it. */ if (txtype == TX_WRITE) { crypt_len = sizeof (lr_write_t) - sizeof (lr_t) - sizeof (blkptr_t); dst_iovecs[vec].iov_base = (char *)dlrp + sizeof (lr_t); dst_iovecs[vec].iov_len = crypt_len; /* copy the bp now since it will not be encrypted */ memcpy(dlrp + sizeof (lr_write_t) - sizeof (blkptr_t), slrp + sizeof (lr_write_t) - sizeof (blkptr_t), sizeof (blkptr_t)); memcpy(aadp, slrp + sizeof (lr_write_t) - sizeof (blkptr_t), sizeof (blkptr_t)); aadp += sizeof (blkptr_t); aad_len += sizeof (blkptr_t); vec++; total_len += crypt_len; if (lr_len != sizeof (lr_write_t)) { crypt_len = lr_len - sizeof (lr_write_t); dst_iovecs[vec].iov_base = (char *) dlrp + sizeof (lr_write_t); dst_iovecs[vec].iov_len = crypt_len; vec++; total_len += crypt_len; } } else { crypt_len = lr_len - sizeof (lr_t); dst_iovecs[vec].iov_base = (char *)dlrp + sizeof (lr_t); dst_iovecs[vec].iov_len = crypt_len; vec++; total_len += crypt_len; } } /* The last iovec will contain the MAC. */ ASSERT3U(vec, ==, nr_iovecs - 1); /* AAD */ dst_iovecs[0].iov_base = aadbuf; dst_iovecs[0].iov_len = aad_len; /* MAC */ dst_iovecs[vec].iov_base = 0; dst_iovecs[vec].iov_len = 0; *no_crypt = (vec == 1); *enc_len = total_len; *authbuf = aadbuf; *auth_len = aad_len; GET_UIO_STRUCT(out_uio)->uio_iov = dst_iovecs; zfs_uio_iovcnt(out_uio) = nr_iovecs; return (0); } /* * Special case handling routine for encrypting / decrypting dnode blocks. */ static int zio_crypt_init_uios_dnode(boolean_t encrypt, uint64_t version, uint8_t *plainbuf, uint8_t *cipherbuf, uint_t datalen, boolean_t byteswap, zfs_uio_t *puio, zfs_uio_t *out_uio, uint_t *enc_len, uint8_t **authbuf, uint_t *auth_len, boolean_t *no_crypt) { uint8_t *aadbuf = zio_buf_alloc(datalen); uint8_t *src, *dst, *aadp; dnode_phys_t *dnp, *adnp, *sdnp, *ddnp; iovec_t *dst_iovecs; uint_t nr_iovecs, crypt_len, vec; uint_t aad_len = 0, total_len = 0; uint_t i, j, max_dnp = datalen >> DNODE_SHIFT; if (encrypt) { src = plainbuf; dst = cipherbuf; } else { src = cipherbuf; dst = plainbuf; } memcpy(dst, src, datalen); sdnp = (dnode_phys_t *)src; ddnp = (dnode_phys_t *)dst; aadp = aadbuf; /* * Count the number of iovecs we will need to do the encryption by * counting the number of bonus buffers that need to be encrypted. */ /* We need at least two iovecs -- one for the AAD, one for the MAC. */ nr_iovecs = 2; for (i = 0; i < max_dnp; i += sdnp[i].dn_extra_slots + 1) { /* * This block may still be byteswapped. However, all of the * values we use are either uint8_t's (for which byteswapping * is a noop) or a * != 0 check, which will work regardless * of whether or not we byteswap. */ if (sdnp[i].dn_type != DMU_OT_NONE && DMU_OT_IS_ENCRYPTED(sdnp[i].dn_bonustype) && sdnp[i].dn_bonuslen != 0) { nr_iovecs++; } } dst_iovecs = kmem_alloc(nr_iovecs * sizeof (iovec_t), KM_SLEEP); /* * Iterate through the dnodes again, this time filling in the uios * we allocated earlier. We also concatenate any data we want to * authenticate onto aadbuf. */ /* The first iovec will contain the authbuf. */ vec = 1; for (i = 0; i < max_dnp; i += sdnp[i].dn_extra_slots + 1) { dnp = &sdnp[i]; /* copy over the core fields and blkptrs (kept as plaintext) */ memcpy(&ddnp[i], dnp, (uint8_t *)DN_BONUS(dnp) - (uint8_t *)dnp); if (dnp->dn_flags & DNODE_FLAG_SPILL_BLKPTR) { memcpy(DN_SPILL_BLKPTR(&ddnp[i]), DN_SPILL_BLKPTR(dnp), sizeof (blkptr_t)); } /* * Handle authenticated data. We authenticate everything in * the dnode that can be brought over when we do a raw send. * This includes all of the core fields as well as the MACs * stored in the bp checksums and all of the portable bits * from blk_prop. We include the dnode padding here in case it * ever gets used in the future. Some dn_flags and dn_used are * not portable so we mask those out values out of the * authenticated data. */ crypt_len = offsetof(dnode_phys_t, dn_blkptr); memcpy(aadp, dnp, crypt_len); adnp = (dnode_phys_t *)aadp; adnp->dn_flags &= DNODE_CRYPT_PORTABLE_FLAGS_MASK; adnp->dn_used = 0; aadp += crypt_len; aad_len += crypt_len; for (j = 0; j < dnp->dn_nblkptr; j++) { zio_crypt_bp_do_aad_updates(&aadp, &aad_len, version, byteswap, &dnp->dn_blkptr[j]); } if (dnp->dn_flags & DNODE_FLAG_SPILL_BLKPTR) { zio_crypt_bp_do_aad_updates(&aadp, &aad_len, version, byteswap, DN_SPILL_BLKPTR(dnp)); } /* * If this bonus buffer needs to be encrypted, we prepare an * iovec_t. The encryption / decryption functions will fill * this in for us with the encrypted or decrypted data. * Otherwise we add the bonus buffer to the authenticated * data buffer and copy it over to the destination. The * encrypted iovec extends to DN_MAX_BONUS_LEN(dnp) so that * we can guarantee alignment with the AES block size * (128 bits). */ crypt_len = DN_MAX_BONUS_LEN(dnp); if (dnp->dn_type != DMU_OT_NONE && DMU_OT_IS_ENCRYPTED(dnp->dn_bonustype) && dnp->dn_bonuslen != 0) { dst_iovecs[vec].iov_base = DN_BONUS(&ddnp[i]); dst_iovecs[vec].iov_len = crypt_len; vec++; total_len += crypt_len; } else { memcpy(DN_BONUS(&ddnp[i]), DN_BONUS(dnp), crypt_len); memcpy(aadp, DN_BONUS(dnp), crypt_len); aadp += crypt_len; aad_len += crypt_len; } } /* The last iovec will contain the MAC. */ ASSERT3U(vec, ==, nr_iovecs - 1); /* AAD */ dst_iovecs[0].iov_base = aadbuf; dst_iovecs[0].iov_len = aad_len; /* MAC */ dst_iovecs[vec].iov_base = 0; dst_iovecs[vec].iov_len = 0; *no_crypt = (vec == 1); *enc_len = total_len; *authbuf = aadbuf; *auth_len = aad_len; GET_UIO_STRUCT(out_uio)->uio_iov = dst_iovecs; zfs_uio_iovcnt(out_uio) = nr_iovecs; return (0); } static int zio_crypt_init_uios_normal(boolean_t encrypt, uint8_t *plainbuf, uint8_t *cipherbuf, uint_t datalen, zfs_uio_t *puio, zfs_uio_t *out_uio, uint_t *enc_len) { (void) puio; int ret; uint_t nr_plain = 1, nr_cipher = 2; iovec_t *plain_iovecs = NULL, *cipher_iovecs = NULL; void *src, *dst; cipher_iovecs = kmem_alloc(nr_cipher * sizeof (iovec_t), KM_SLEEP); if (!cipher_iovecs) { ret = SET_ERROR(ENOMEM); goto error; } memset(cipher_iovecs, 0, nr_cipher * sizeof (iovec_t)); if (encrypt) { src = plainbuf; dst = cipherbuf; } else { src = cipherbuf; dst = plainbuf; } memcpy(dst, src, datalen); cipher_iovecs[0].iov_base = dst; cipher_iovecs[0].iov_len = datalen; *enc_len = datalen; GET_UIO_STRUCT(out_uio)->uio_iov = cipher_iovecs; zfs_uio_iovcnt(out_uio) = nr_cipher; return (0); error: if (plain_iovecs != NULL) kmem_free(plain_iovecs, nr_plain * sizeof (iovec_t)); if (cipher_iovecs != NULL) kmem_free(cipher_iovecs, nr_cipher * sizeof (iovec_t)); *enc_len = 0; GET_UIO_STRUCT(out_uio)->uio_iov = NULL; zfs_uio_iovcnt(out_uio) = 0; return (ret); } /* * This function builds up the plaintext (puio) and ciphertext (cuio) uios so * that they can be used for encryption and decryption by zio_do_crypt_uio(). * Most blocks will use zio_crypt_init_uios_normal(), with ZIL and dnode blocks * requiring special handling to parse out pieces that are to be encrypted. The * authbuf is used by these special cases to store additional authenticated * data (AAD) for the encryption modes. */ static int zio_crypt_init_uios(boolean_t encrypt, uint64_t version, dmu_object_type_t ot, uint8_t *plainbuf, uint8_t *cipherbuf, uint_t datalen, boolean_t byteswap, uint8_t *mac, zfs_uio_t *puio, zfs_uio_t *cuio, uint_t *enc_len, uint8_t **authbuf, uint_t *auth_len, boolean_t *no_crypt) { int ret; iovec_t *mac_iov; ASSERT(DMU_OT_IS_ENCRYPTED(ot) || ot == DMU_OT_NONE); /* route to handler */ switch (ot) { case DMU_OT_INTENT_LOG: ret = zio_crypt_init_uios_zil(encrypt, plainbuf, cipherbuf, datalen, byteswap, puio, cuio, enc_len, authbuf, auth_len, no_crypt); break; case DMU_OT_DNODE: ret = zio_crypt_init_uios_dnode(encrypt, version, plainbuf, cipherbuf, datalen, byteswap, puio, cuio, enc_len, authbuf, auth_len, no_crypt); break; default: ret = zio_crypt_init_uios_normal(encrypt, plainbuf, cipherbuf, datalen, puio, cuio, enc_len); *authbuf = NULL; *auth_len = 0; *no_crypt = B_FALSE; break; } if (ret != 0) goto error; /* populate the uios */ zfs_uio_segflg(cuio) = UIO_SYSSPACE; mac_iov = ((iovec_t *)&(GET_UIO_STRUCT(cuio)-> uio_iov[zfs_uio_iovcnt(cuio) - 1])); mac_iov->iov_base = (void *)mac; mac_iov->iov_len = ZIO_DATA_MAC_LEN; return (0); error: return (ret); } void *failed_decrypt_buf; int faile_decrypt_size; /* * Primary encryption / decryption entrypoint for zio data. */ int zio_do_crypt_data(boolean_t encrypt, zio_crypt_key_t *key, dmu_object_type_t ot, boolean_t byteswap, uint8_t *salt, uint8_t *iv, uint8_t *mac, uint_t datalen, uint8_t *plainbuf, uint8_t *cipherbuf, boolean_t *no_crypt) { int ret; boolean_t locked = B_FALSE; uint64_t crypt = key->zk_crypt; uint_t keydata_len = zio_crypt_table[crypt].ci_keylen; uint_t enc_len, auth_len; zfs_uio_t puio, cuio; struct uio puio_s, cuio_s; uint8_t enc_keydata[MASTER_KEY_MAX_LEN]; crypto_key_t tmp_ckey, *ckey = NULL; freebsd_crypt_session_t *tmpl = NULL; uint8_t *authbuf = NULL; zfs_uio_init(&puio, &puio_s); zfs_uio_init(&cuio, &cuio_s); memset(GET_UIO_STRUCT(&puio), 0, sizeof (struct uio)); memset(GET_UIO_STRUCT(&cuio), 0, sizeof (struct uio)); #ifdef FCRYPTO_DEBUG printf("%s(%s, %p, %p, %d, %p, %p, %u, %s, %p, %p, %p)\n", __FUNCTION__, encrypt ? "encrypt" : "decrypt", key, salt, ot, iv, mac, datalen, byteswap ? "byteswap" : "native_endian", plainbuf, cipherbuf, no_crypt); printf("\tkey = {"); for (int i = 0; i < key->zk_current_key.ck_length/8; i++) printf("%02x ", ((uint8_t *)key->zk_current_key.ck_data)[i]); printf("}\n"); #endif /* create uios for encryption */ ret = zio_crypt_init_uios(encrypt, key->zk_version, ot, plainbuf, cipherbuf, datalen, byteswap, mac, &puio, &cuio, &enc_len, &authbuf, &auth_len, no_crypt); if (ret != 0) return (ret); /* * If the needed key is the current one, just use it. Otherwise we * need to generate a temporary one from the given salt + master key. * If we are encrypting, we must return a copy of the current salt * so that it can be stored in the blkptr_t. */ rw_enter(&key->zk_salt_lock, RW_READER); locked = B_TRUE; if (memcmp(salt, key->zk_salt, ZIO_DATA_SALT_LEN) == 0) { ckey = &key->zk_current_key; tmpl = &key->zk_session; } else { rw_exit(&key->zk_salt_lock); locked = B_FALSE; ret = hkdf_sha512(key->zk_master_keydata, keydata_len, NULL, 0, salt, ZIO_DATA_SALT_LEN, enc_keydata, keydata_len); if (ret != 0) goto error; tmp_ckey.ck_data = enc_keydata; tmp_ckey.ck_length = CRYPTO_BYTES2BITS(keydata_len); ckey = &tmp_ckey; tmpl = NULL; } /* perform the encryption / decryption */ ret = zio_do_crypt_uio_opencrypto(encrypt, tmpl, key->zk_crypt, ckey, iv, enc_len, &cuio, auth_len); if (ret != 0) goto error; if (locked) { rw_exit(&key->zk_salt_lock); - locked = B_FALSE; } if (authbuf != NULL) zio_buf_free(authbuf, datalen); if (ckey == &tmp_ckey) memset(enc_keydata, 0, keydata_len); zio_crypt_destroy_uio(&puio); zio_crypt_destroy_uio(&cuio); return (0); error: if (!encrypt) { if (failed_decrypt_buf != NULL) kmem_free(failed_decrypt_buf, failed_decrypt_size); failed_decrypt_buf = kmem_alloc(datalen, KM_SLEEP); failed_decrypt_size = datalen; memcpy(failed_decrypt_buf, cipherbuf, datalen); } if (locked) rw_exit(&key->zk_salt_lock); if (authbuf != NULL) zio_buf_free(authbuf, datalen); if (ckey == &tmp_ckey) memset(enc_keydata, 0, keydata_len); zio_crypt_destroy_uio(&puio); zio_crypt_destroy_uio(&cuio); return (SET_ERROR(ret)); } /* * Simple wrapper around zio_do_crypt_data() to work with abd's instead of * linear buffers. */ int zio_do_crypt_abd(boolean_t encrypt, zio_crypt_key_t *key, dmu_object_type_t ot, boolean_t byteswap, uint8_t *salt, uint8_t *iv, uint8_t *mac, uint_t datalen, abd_t *pabd, abd_t *cabd, boolean_t *no_crypt) { int ret; void *ptmp, *ctmp; if (encrypt) { ptmp = abd_borrow_buf_copy(pabd, datalen); ctmp = abd_borrow_buf(cabd, datalen); } else { ptmp = abd_borrow_buf(pabd, datalen); ctmp = abd_borrow_buf_copy(cabd, datalen); } ret = zio_do_crypt_data(encrypt, key, ot, byteswap, salt, iv, mac, datalen, ptmp, ctmp, no_crypt); if (ret != 0) goto error; if (encrypt) { abd_return_buf(pabd, ptmp, datalen); abd_return_buf_copy(cabd, ctmp, datalen); } else { abd_return_buf_copy(pabd, ptmp, datalen); abd_return_buf(cabd, ctmp, datalen); } return (0); error: if (encrypt) { abd_return_buf(pabd, ptmp, datalen); abd_return_buf_copy(cabd, ctmp, datalen); } else { abd_return_buf_copy(pabd, ptmp, datalen); abd_return_buf(cabd, ctmp, datalen); } return (SET_ERROR(ret)); } #if defined(_KERNEL) && defined(HAVE_SPL) /* CSTYLED */ module_param(zfs_key_max_salt_uses, ulong, 0644); MODULE_PARM_DESC(zfs_key_max_salt_uses, "Max number of times a salt value " "can be used for generating encryption keys before it is rotated"); #endif diff --git a/module/os/linux/zfs/zfs_znode.c b/module/os/linux/zfs/zfs_znode.c index 73c21b6c00a8..a97955f4020a 100644 --- a/module/os/linux/zfs/zfs_znode.c +++ b/module/os/linux/zfs/zfs_znode.c @@ -1,2265 +1,2264 @@ /* * CDDL HEADER START * * The contents of this file are subject to the terms of the * Common Development and Distribution License (the "License"). * You may not use this file except in compliance with the License. * * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE * or https://opensource.org/licenses/CDDL-1.0. * See the License for the specific language governing permissions * and limitations under the License. * * When distributing Covered Code, include this CDDL HEADER in each * file and include the License file at usr/src/OPENSOLARIS.LICENSE. * If applicable, add the following below this CDDL HEADER, with the * fields enclosed by brackets "[]" replaced with your own identifying * information: Portions Copyright [yyyy] [name of copyright owner] * * CDDL HEADER END */ /* * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. * Copyright (c) 2012, 2018 by Delphix. All rights reserved. */ /* Portions Copyright 2007 Jeremy Teo */ #ifdef _KERNEL #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #endif /* _KERNEL */ #include #include #include #include #include #include #include #include #include #include #include "zfs_prop.h" #include "zfs_comutil.h" /* * Functions needed for userland (ie: libzpool) are not put under * #ifdef_KERNEL; the rest of the functions have dependencies * (such as VFS logic) that will not compile easily in userland. */ #ifdef _KERNEL static kmem_cache_t *znode_cache = NULL; static kmem_cache_t *znode_hold_cache = NULL; unsigned int zfs_object_mutex_size = ZFS_OBJ_MTX_SZ; /* * This is used by the test suite so that it can delay znodes from being * freed in order to inspect the unlinked set. */ static int zfs_unlink_suspend_progress = 0; /* * This callback is invoked when acquiring a RL_WRITER or RL_APPEND lock on * z_rangelock. It will modify the offset and length of the lock to reflect * znode-specific information, and convert RL_APPEND to RL_WRITER. This is * called with the rangelock_t's rl_lock held, which avoids races. */ static void zfs_rangelock_cb(zfs_locked_range_t *new, void *arg) { znode_t *zp = arg; /* * If in append mode, convert to writer and lock starting at the * current end of file. */ if (new->lr_type == RL_APPEND) { new->lr_offset = zp->z_size; new->lr_type = RL_WRITER; } /* * If we need to grow the block size then lock the whole file range. */ uint64_t end_size = MAX(zp->z_size, new->lr_offset + new->lr_length); if (end_size > zp->z_blksz && (!ISP2(zp->z_blksz) || zp->z_blksz < ZTOZSB(zp)->z_max_blksz)) { new->lr_offset = 0; new->lr_length = UINT64_MAX; } } static int zfs_znode_cache_constructor(void *buf, void *arg, int kmflags) { (void) arg, (void) kmflags; znode_t *zp = buf; inode_init_once(ZTOI(zp)); list_link_init(&zp->z_link_node); mutex_init(&zp->z_lock, NULL, MUTEX_DEFAULT, NULL); rw_init(&zp->z_parent_lock, NULL, RW_DEFAULT, NULL); rw_init(&zp->z_name_lock, NULL, RW_NOLOCKDEP, NULL); mutex_init(&zp->z_acl_lock, NULL, MUTEX_DEFAULT, NULL); rw_init(&zp->z_xattr_lock, NULL, RW_DEFAULT, NULL); zfs_rangelock_init(&zp->z_rangelock, zfs_rangelock_cb, zp); zp->z_dirlocks = NULL; zp->z_acl_cached = NULL; zp->z_xattr_cached = NULL; zp->z_xattr_parent = 0; zp->z_sync_writes_cnt = 0; zp->z_async_writes_cnt = 0; return (0); } static void zfs_znode_cache_destructor(void *buf, void *arg) { (void) arg; znode_t *zp = buf; ASSERT(!list_link_active(&zp->z_link_node)); mutex_destroy(&zp->z_lock); rw_destroy(&zp->z_parent_lock); rw_destroy(&zp->z_name_lock); mutex_destroy(&zp->z_acl_lock); rw_destroy(&zp->z_xattr_lock); zfs_rangelock_fini(&zp->z_rangelock); ASSERT3P(zp->z_dirlocks, ==, NULL); ASSERT3P(zp->z_acl_cached, ==, NULL); ASSERT3P(zp->z_xattr_cached, ==, NULL); ASSERT0(atomic_load_32(&zp->z_sync_writes_cnt)); ASSERT0(atomic_load_32(&zp->z_async_writes_cnt)); } static int zfs_znode_hold_cache_constructor(void *buf, void *arg, int kmflags) { (void) arg, (void) kmflags; znode_hold_t *zh = buf; mutex_init(&zh->zh_lock, NULL, MUTEX_DEFAULT, NULL); zfs_refcount_create(&zh->zh_refcount); zh->zh_obj = ZFS_NO_OBJECT; return (0); } static void zfs_znode_hold_cache_destructor(void *buf, void *arg) { (void) arg; znode_hold_t *zh = buf; mutex_destroy(&zh->zh_lock); zfs_refcount_destroy(&zh->zh_refcount); } void zfs_znode_init(void) { /* * Initialize zcache. The KMC_SLAB hint is used in order that it be * backed by kmalloc() when on the Linux slab in order that any * wait_on_bit() operations on the related inode operate properly. */ ASSERT(znode_cache == NULL); znode_cache = kmem_cache_create("zfs_znode_cache", sizeof (znode_t), 0, zfs_znode_cache_constructor, zfs_znode_cache_destructor, NULL, NULL, NULL, KMC_SLAB); ASSERT(znode_hold_cache == NULL); znode_hold_cache = kmem_cache_create("zfs_znode_hold_cache", sizeof (znode_hold_t), 0, zfs_znode_hold_cache_constructor, zfs_znode_hold_cache_destructor, NULL, NULL, NULL, 0); } void zfs_znode_fini(void) { /* * Cleanup zcache */ if (znode_cache) kmem_cache_destroy(znode_cache); znode_cache = NULL; if (znode_hold_cache) kmem_cache_destroy(znode_hold_cache); znode_hold_cache = NULL; } /* * The zfs_znode_hold_enter() / zfs_znode_hold_exit() functions are used to * serialize access to a znode and its SA buffer while the object is being * created or destroyed. This kind of locking would normally reside in the * znode itself but in this case that's impossible because the znode and SA * buffer may not yet exist. Therefore the locking is handled externally * with an array of mutexes and AVLs trees which contain per-object locks. * * In zfs_znode_hold_enter() a per-object lock is created as needed, inserted * in to the correct AVL tree and finally the per-object lock is held. In * zfs_znode_hold_exit() the process is reversed. The per-object lock is * released, removed from the AVL tree and destroyed if there are no waiters. * * This scheme has two important properties: * * 1) No memory allocations are performed while holding one of the z_hold_locks. * This ensures evict(), which can be called from direct memory reclaim, will * never block waiting on a z_hold_locks which just happens to have hashed * to the same index. * * 2) All locks used to serialize access to an object are per-object and never * shared. This minimizes lock contention without creating a large number * of dedicated locks. * * On the downside it does require znode_lock_t structures to be frequently * allocated and freed. However, because these are backed by a kmem cache * and very short lived this cost is minimal. */ int zfs_znode_hold_compare(const void *a, const void *b) { const znode_hold_t *zh_a = (const znode_hold_t *)a; const znode_hold_t *zh_b = (const znode_hold_t *)b; return (TREE_CMP(zh_a->zh_obj, zh_b->zh_obj)); } static boolean_t __maybe_unused zfs_znode_held(zfsvfs_t *zfsvfs, uint64_t obj) { znode_hold_t *zh, search; int i = ZFS_OBJ_HASH(zfsvfs, obj); boolean_t held; search.zh_obj = obj; mutex_enter(&zfsvfs->z_hold_locks[i]); zh = avl_find(&zfsvfs->z_hold_trees[i], &search, NULL); held = (zh && MUTEX_HELD(&zh->zh_lock)) ? B_TRUE : B_FALSE; mutex_exit(&zfsvfs->z_hold_locks[i]); return (held); } static znode_hold_t * zfs_znode_hold_enter(zfsvfs_t *zfsvfs, uint64_t obj) { znode_hold_t *zh, *zh_new, search; int i = ZFS_OBJ_HASH(zfsvfs, obj); boolean_t found = B_FALSE; zh_new = kmem_cache_alloc(znode_hold_cache, KM_SLEEP); zh_new->zh_obj = obj; search.zh_obj = obj; mutex_enter(&zfsvfs->z_hold_locks[i]); zh = avl_find(&zfsvfs->z_hold_trees[i], &search, NULL); if (likely(zh == NULL)) { zh = zh_new; avl_add(&zfsvfs->z_hold_trees[i], zh); } else { ASSERT3U(zh->zh_obj, ==, obj); found = B_TRUE; } zfs_refcount_add(&zh->zh_refcount, NULL); mutex_exit(&zfsvfs->z_hold_locks[i]); if (found == B_TRUE) kmem_cache_free(znode_hold_cache, zh_new); ASSERT(MUTEX_NOT_HELD(&zh->zh_lock)); ASSERT3S(zfs_refcount_count(&zh->zh_refcount), >, 0); mutex_enter(&zh->zh_lock); return (zh); } static void zfs_znode_hold_exit(zfsvfs_t *zfsvfs, znode_hold_t *zh) { int i = ZFS_OBJ_HASH(zfsvfs, zh->zh_obj); boolean_t remove = B_FALSE; ASSERT(zfs_znode_held(zfsvfs, zh->zh_obj)); ASSERT3S(zfs_refcount_count(&zh->zh_refcount), >, 0); mutex_exit(&zh->zh_lock); mutex_enter(&zfsvfs->z_hold_locks[i]); if (zfs_refcount_remove(&zh->zh_refcount, NULL) == 0) { avl_remove(&zfsvfs->z_hold_trees[i], zh); remove = B_TRUE; } mutex_exit(&zfsvfs->z_hold_locks[i]); if (remove == B_TRUE) kmem_cache_free(znode_hold_cache, zh); } dev_t zfs_cmpldev(uint64_t dev) { return (dev); } static void zfs_znode_sa_init(zfsvfs_t *zfsvfs, znode_t *zp, dmu_buf_t *db, dmu_object_type_t obj_type, sa_handle_t *sa_hdl) { ASSERT(zfs_znode_held(zfsvfs, zp->z_id)); mutex_enter(&zp->z_lock); ASSERT(zp->z_sa_hdl == NULL); ASSERT(zp->z_acl_cached == NULL); if (sa_hdl == NULL) { VERIFY(0 == sa_handle_get_from_db(zfsvfs->z_os, db, zp, SA_HDL_SHARED, &zp->z_sa_hdl)); } else { zp->z_sa_hdl = sa_hdl; sa_set_userp(sa_hdl, zp); } zp->z_is_sa = (obj_type == DMU_OT_SA) ? B_TRUE : B_FALSE; mutex_exit(&zp->z_lock); } void zfs_znode_dmu_fini(znode_t *zp) { ASSERT(zfs_znode_held(ZTOZSB(zp), zp->z_id) || zp->z_unlinked || RW_WRITE_HELD(&ZTOZSB(zp)->z_teardown_inactive_lock)); sa_handle_destroy(zp->z_sa_hdl); zp->z_sa_hdl = NULL; } /* * Called by new_inode() to allocate a new inode. */ int zfs_inode_alloc(struct super_block *sb, struct inode **ip) { znode_t *zp; zp = kmem_cache_alloc(znode_cache, KM_SLEEP); *ip = ZTOI(zp); return (0); } /* * Called in multiple places when an inode should be destroyed. */ void zfs_inode_destroy(struct inode *ip) { znode_t *zp = ITOZ(ip); zfsvfs_t *zfsvfs = ZTOZSB(zp); mutex_enter(&zfsvfs->z_znodes_lock); if (list_link_active(&zp->z_link_node)) { list_remove(&zfsvfs->z_all_znodes, zp); zfsvfs->z_nr_znodes--; } mutex_exit(&zfsvfs->z_znodes_lock); if (zp->z_acl_cached) { zfs_acl_free(zp->z_acl_cached); zp->z_acl_cached = NULL; } if (zp->z_xattr_cached) { nvlist_free(zp->z_xattr_cached); zp->z_xattr_cached = NULL; } kmem_cache_free(znode_cache, zp); } static void zfs_inode_set_ops(zfsvfs_t *zfsvfs, struct inode *ip) { uint64_t rdev = 0; switch (ip->i_mode & S_IFMT) { case S_IFREG: ip->i_op = &zpl_inode_operations; ip->i_fop = &zpl_file_operations; ip->i_mapping->a_ops = &zpl_address_space_operations; break; case S_IFDIR: ip->i_op = &zpl_dir_inode_operations; ip->i_fop = &zpl_dir_file_operations; ITOZ(ip)->z_zn_prefetch = B_TRUE; break; case S_IFLNK: ip->i_op = &zpl_symlink_inode_operations; break; /* * rdev is only stored in a SA only for device files. */ case S_IFCHR: case S_IFBLK: (void) sa_lookup(ITOZ(ip)->z_sa_hdl, SA_ZPL_RDEV(zfsvfs), &rdev, sizeof (rdev)); zfs_fallthrough; case S_IFIFO: case S_IFSOCK: init_special_inode(ip, ip->i_mode, rdev); ip->i_op = &zpl_special_inode_operations; break; default: zfs_panic_recover("inode %llu has invalid mode: 0x%x\n", (u_longlong_t)ip->i_ino, ip->i_mode); /* Assume the inode is a file and attempt to continue */ ip->i_mode = S_IFREG | 0644; ip->i_op = &zpl_inode_operations; ip->i_fop = &zpl_file_operations; ip->i_mapping->a_ops = &zpl_address_space_operations; break; } } static void zfs_set_inode_flags(znode_t *zp, struct inode *ip) { /* * Linux and Solaris have different sets of file attributes, so we * restrict this conversion to the intersection of the two. */ #ifdef HAVE_INODE_SET_FLAGS unsigned int flags = 0; if (zp->z_pflags & ZFS_IMMUTABLE) flags |= S_IMMUTABLE; if (zp->z_pflags & ZFS_APPENDONLY) flags |= S_APPEND; inode_set_flags(ip, flags, S_IMMUTABLE|S_APPEND); #else if (zp->z_pflags & ZFS_IMMUTABLE) ip->i_flags |= S_IMMUTABLE; else ip->i_flags &= ~S_IMMUTABLE; if (zp->z_pflags & ZFS_APPENDONLY) ip->i_flags |= S_APPEND; else ip->i_flags &= ~S_APPEND; #endif } /* * Update the embedded inode given the znode. */ void zfs_znode_update_vfs(znode_t *zp) { zfsvfs_t *zfsvfs; struct inode *ip; uint32_t blksize; u_longlong_t i_blocks; ASSERT(zp != NULL); zfsvfs = ZTOZSB(zp); ip = ZTOI(zp); /* Skip .zfs control nodes which do not exist on disk. */ if (zfsctl_is_node(ip)) return; dmu_object_size_from_db(sa_get_db(zp->z_sa_hdl), &blksize, &i_blocks); spin_lock(&ip->i_lock); ip->i_mode = zp->z_mode; ip->i_blocks = i_blocks; i_size_write(ip, zp->z_size); spin_unlock(&ip->i_lock); } /* * Construct a znode+inode and initialize. * * This does not do a call to dmu_set_user() that is * up to the caller to do, in case you don't want to * return the znode */ static znode_t * zfs_znode_alloc(zfsvfs_t *zfsvfs, dmu_buf_t *db, int blksz, dmu_object_type_t obj_type, sa_handle_t *hdl) { znode_t *zp; struct inode *ip; uint64_t mode; uint64_t parent; uint64_t tmp_gen; uint64_t links; uint64_t z_uid, z_gid; uint64_t atime[2], mtime[2], ctime[2], btime[2]; uint64_t projid = ZFS_DEFAULT_PROJID; sa_bulk_attr_t bulk[12]; int count = 0; ASSERT(zfsvfs != NULL); ip = new_inode(zfsvfs->z_sb); if (ip == NULL) return (NULL); zp = ITOZ(ip); ASSERT(zp->z_dirlocks == NULL); ASSERT3P(zp->z_acl_cached, ==, NULL); ASSERT3P(zp->z_xattr_cached, ==, NULL); zp->z_unlinked = B_FALSE; zp->z_atime_dirty = B_FALSE; zp->z_is_mapped = B_FALSE; zp->z_is_ctldir = B_FALSE; zp->z_is_stale = B_FALSE; zp->z_suspended = B_FALSE; zp->z_sa_hdl = NULL; zp->z_mapcnt = 0; zp->z_id = db->db_object; zp->z_blksz = blksz; zp->z_seq = 0x7A4653; zp->z_sync_cnt = 0; zp->z_sync_writes_cnt = 0; zp->z_async_writes_cnt = 0; zfs_znode_sa_init(zfsvfs, zp, db, obj_type, hdl); SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_MODE(zfsvfs), NULL, &mode, 8); SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_GEN(zfsvfs), NULL, &tmp_gen, 8); SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_SIZE(zfsvfs), NULL, &zp->z_size, 8); SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_LINKS(zfsvfs), NULL, &links, 8); SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_FLAGS(zfsvfs), NULL, &zp->z_pflags, 8); SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_PARENT(zfsvfs), NULL, &parent, 8); SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_UID(zfsvfs), NULL, &z_uid, 8); SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_GID(zfsvfs), NULL, &z_gid, 8); SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_ATIME(zfsvfs), NULL, &atime, 16); SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_MTIME(zfsvfs), NULL, &mtime, 16); SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_CTIME(zfsvfs), NULL, &ctime, 16); SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_CRTIME(zfsvfs), NULL, &btime, 16); if (sa_bulk_lookup(zp->z_sa_hdl, bulk, count) != 0 || tmp_gen == 0 || (dmu_objset_projectquota_enabled(zfsvfs->z_os) && (zp->z_pflags & ZFS_PROJID) && sa_lookup(zp->z_sa_hdl, SA_ZPL_PROJID(zfsvfs), &projid, 8) != 0)) { if (hdl == NULL) sa_handle_destroy(zp->z_sa_hdl); zp->z_sa_hdl = NULL; goto error; } zp->z_projid = projid; zp->z_mode = ip->i_mode = mode; ip->i_generation = (uint32_t)tmp_gen; ip->i_blkbits = SPA_MINBLOCKSHIFT; set_nlink(ip, (uint32_t)links); zfs_uid_write(ip, z_uid); zfs_gid_write(ip, z_gid); zfs_set_inode_flags(zp, ip); /* Cache the xattr parent id */ if (zp->z_pflags & ZFS_XATTR) zp->z_xattr_parent = parent; ZFS_TIME_DECODE(&ip->i_atime, atime); ZFS_TIME_DECODE(&ip->i_mtime, mtime); ZFS_TIME_DECODE(&ip->i_ctime, ctime); ZFS_TIME_DECODE(&zp->z_btime, btime); ip->i_ino = zp->z_id; zfs_znode_update_vfs(zp); zfs_inode_set_ops(zfsvfs, ip); /* * The only way insert_inode_locked() can fail is if the ip->i_ino * number is already hashed for this super block. This can never * happen because the inode numbers map 1:1 with the object numbers. * * Exceptions include rolling back a mounted file system, either * from the zfs rollback or zfs recv command. * * Active inodes are unhashed during the rollback, but since zrele * can happen asynchronously, we can't guarantee they've been * unhashed. This can cause hash collisions in unlinked drain * processing so do not hash unlinked znodes. */ if (links > 0) VERIFY3S(insert_inode_locked(ip), ==, 0); mutex_enter(&zfsvfs->z_znodes_lock); list_insert_tail(&zfsvfs->z_all_znodes, zp); zfsvfs->z_nr_znodes++; mutex_exit(&zfsvfs->z_znodes_lock); if (links > 0) unlock_new_inode(ip); return (zp); error: iput(ip); return (NULL); } /* * Safely mark an inode dirty. Inodes which are part of a read-only * file system or snapshot may not be dirtied. */ void zfs_mark_inode_dirty(struct inode *ip) { zfsvfs_t *zfsvfs = ITOZSB(ip); if (zfs_is_readonly(zfsvfs) || dmu_objset_is_snapshot(zfsvfs->z_os)) return; mark_inode_dirty(ip); } static uint64_t empty_xattr; static uint64_t pad[4]; static zfs_acl_phys_t acl_phys; /* * Create a new DMU object to hold a zfs znode. * * IN: dzp - parent directory for new znode * vap - file attributes for new znode * tx - dmu transaction id for zap operations * cr - credentials of caller * flag - flags: * IS_ROOT_NODE - new object will be root * IS_TMPFILE - new object is of O_TMPFILE * IS_XATTR - new object is an attribute * acl_ids - ACL related attributes * * OUT: zpp - allocated znode (set to dzp if IS_ROOT_NODE) * */ void zfs_mknode(znode_t *dzp, vattr_t *vap, dmu_tx_t *tx, cred_t *cr, uint_t flag, znode_t **zpp, zfs_acl_ids_t *acl_ids) { uint64_t crtime[2], atime[2], mtime[2], ctime[2]; uint64_t mode, size, links, parent, pflags; uint64_t projid = ZFS_DEFAULT_PROJID; uint64_t rdev = 0; zfsvfs_t *zfsvfs = ZTOZSB(dzp); dmu_buf_t *db; inode_timespec_t now; uint64_t gen, obj; int bonuslen; int dnodesize; sa_handle_t *sa_hdl; dmu_object_type_t obj_type; sa_bulk_attr_t *sa_attrs; int cnt = 0; zfs_acl_locator_cb_t locate = { 0 }; znode_hold_t *zh; if (zfsvfs->z_replay) { obj = vap->va_nodeid; now = vap->va_ctime; /* see zfs_replay_create() */ gen = vap->va_nblocks; /* ditto */ dnodesize = vap->va_fsid; /* ditto */ } else { obj = 0; gethrestime(&now); gen = dmu_tx_get_txg(tx); dnodesize = dmu_objset_dnodesize(zfsvfs->z_os); } if (dnodesize == 0) dnodesize = DNODE_MIN_SIZE; obj_type = zfsvfs->z_use_sa ? DMU_OT_SA : DMU_OT_ZNODE; bonuslen = (obj_type == DMU_OT_SA) ? DN_BONUS_SIZE(dnodesize) : ZFS_OLD_ZNODE_PHYS_SIZE; /* * Create a new DMU object. */ /* * There's currently no mechanism for pre-reading the blocks that will * be needed to allocate a new object, so we accept the small chance * that there will be an i/o error and we will fail one of the * assertions below. */ if (S_ISDIR(vap->va_mode)) { if (zfsvfs->z_replay) { VERIFY0(zap_create_claim_norm_dnsize(zfsvfs->z_os, obj, zfsvfs->z_norm, DMU_OT_DIRECTORY_CONTENTS, obj_type, bonuslen, dnodesize, tx)); } else { obj = zap_create_norm_dnsize(zfsvfs->z_os, zfsvfs->z_norm, DMU_OT_DIRECTORY_CONTENTS, obj_type, bonuslen, dnodesize, tx); } } else { if (zfsvfs->z_replay) { VERIFY0(dmu_object_claim_dnsize(zfsvfs->z_os, obj, DMU_OT_PLAIN_FILE_CONTENTS, 0, obj_type, bonuslen, dnodesize, tx)); } else { obj = dmu_object_alloc_dnsize(zfsvfs->z_os, DMU_OT_PLAIN_FILE_CONTENTS, 0, obj_type, bonuslen, dnodesize, tx); } } zh = zfs_znode_hold_enter(zfsvfs, obj); VERIFY0(sa_buf_hold(zfsvfs->z_os, obj, NULL, &db)); /* * If this is the root, fix up the half-initialized parent pointer * to reference the just-allocated physical data area. */ if (flag & IS_ROOT_NODE) { dzp->z_id = obj; } /* * If parent is an xattr, so am I. */ if (dzp->z_pflags & ZFS_XATTR) { flag |= IS_XATTR; } if (zfsvfs->z_use_fuids) pflags = ZFS_ARCHIVE | ZFS_AV_MODIFIED; else pflags = 0; if (S_ISDIR(vap->va_mode)) { size = 2; /* contents ("." and "..") */ links = 2; } else { size = 0; links = (flag & IS_TMPFILE) ? 0 : 1; } if (S_ISBLK(vap->va_mode) || S_ISCHR(vap->va_mode)) rdev = vap->va_rdev; parent = dzp->z_id; mode = acl_ids->z_mode; if (flag & IS_XATTR) pflags |= ZFS_XATTR; if (S_ISREG(vap->va_mode) || S_ISDIR(vap->va_mode)) { /* * With ZFS_PROJID flag, we can easily know whether there is * project ID stored on disk or not. See zfs_space_delta_cb(). */ if (obj_type != DMU_OT_ZNODE && dmu_objset_projectquota_enabled(zfsvfs->z_os)) pflags |= ZFS_PROJID; /* * Inherit project ID from parent if required. */ projid = zfs_inherit_projid(dzp); if (dzp->z_pflags & ZFS_PROJINHERIT) pflags |= ZFS_PROJINHERIT; } /* * No execs denied will be determined when zfs_mode_compute() is called. */ pflags |= acl_ids->z_aclp->z_hints & (ZFS_ACL_TRIVIAL|ZFS_INHERIT_ACE|ZFS_ACL_AUTO_INHERIT| ZFS_ACL_DEFAULTED|ZFS_ACL_PROTECTED); ZFS_TIME_ENCODE(&now, crtime); ZFS_TIME_ENCODE(&now, ctime); if (vap->va_mask & ATTR_ATIME) { ZFS_TIME_ENCODE(&vap->va_atime, atime); } else { ZFS_TIME_ENCODE(&now, atime); } if (vap->va_mask & ATTR_MTIME) { ZFS_TIME_ENCODE(&vap->va_mtime, mtime); } else { ZFS_TIME_ENCODE(&now, mtime); } /* Now add in all of the "SA" attributes */ VERIFY(0 == sa_handle_get_from_db(zfsvfs->z_os, db, NULL, SA_HDL_SHARED, &sa_hdl)); /* * Setup the array of attributes to be replaced/set on the new file * * order for DMU_OT_ZNODE is critical since it needs to be constructed * in the old znode_phys_t format. Don't change this ordering */ sa_attrs = kmem_alloc(sizeof (sa_bulk_attr_t) * ZPL_END, KM_SLEEP); if (obj_type == DMU_OT_ZNODE) { SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_ATIME(zfsvfs), NULL, &atime, 16); SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_MTIME(zfsvfs), NULL, &mtime, 16); SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_CTIME(zfsvfs), NULL, &ctime, 16); SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_CRTIME(zfsvfs), NULL, &crtime, 16); SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_GEN(zfsvfs), NULL, &gen, 8); SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_MODE(zfsvfs), NULL, &mode, 8); SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_SIZE(zfsvfs), NULL, &size, 8); SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_PARENT(zfsvfs), NULL, &parent, 8); } else { SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_MODE(zfsvfs), NULL, &mode, 8); SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_SIZE(zfsvfs), NULL, &size, 8); SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_GEN(zfsvfs), NULL, &gen, 8); SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_UID(zfsvfs), NULL, &acl_ids->z_fuid, 8); SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_GID(zfsvfs), NULL, &acl_ids->z_fgid, 8); SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_PARENT(zfsvfs), NULL, &parent, 8); SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_FLAGS(zfsvfs), NULL, &pflags, 8); SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_ATIME(zfsvfs), NULL, &atime, 16); SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_MTIME(zfsvfs), NULL, &mtime, 16); SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_CTIME(zfsvfs), NULL, &ctime, 16); SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_CRTIME(zfsvfs), NULL, &crtime, 16); } SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_LINKS(zfsvfs), NULL, &links, 8); if (obj_type == DMU_OT_ZNODE) { SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_XATTR(zfsvfs), NULL, &empty_xattr, 8); } else if (dmu_objset_projectquota_enabled(zfsvfs->z_os) && pflags & ZFS_PROJID) { SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_PROJID(zfsvfs), NULL, &projid, 8); } if (obj_type == DMU_OT_ZNODE || (S_ISBLK(vap->va_mode) || S_ISCHR(vap->va_mode))) { SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_RDEV(zfsvfs), NULL, &rdev, 8); } if (obj_type == DMU_OT_ZNODE) { SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_FLAGS(zfsvfs), NULL, &pflags, 8); SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_UID(zfsvfs), NULL, &acl_ids->z_fuid, 8); SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_GID(zfsvfs), NULL, &acl_ids->z_fgid, 8); SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_PAD(zfsvfs), NULL, pad, sizeof (uint64_t) * 4); SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_ZNODE_ACL(zfsvfs), NULL, &acl_phys, sizeof (zfs_acl_phys_t)); } else if (acl_ids->z_aclp->z_version >= ZFS_ACL_VERSION_FUID) { SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_DACL_COUNT(zfsvfs), NULL, &acl_ids->z_aclp->z_acl_count, 8); locate.cb_aclp = acl_ids->z_aclp; SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_DACL_ACES(zfsvfs), zfs_acl_data_locator, &locate, acl_ids->z_aclp->z_acl_bytes); mode = zfs_mode_compute(mode, acl_ids->z_aclp, &pflags, acl_ids->z_fuid, acl_ids->z_fgid); } VERIFY(sa_replace_all_by_template(sa_hdl, sa_attrs, cnt, tx) == 0); if (!(flag & IS_ROOT_NODE)) { /* * The call to zfs_znode_alloc() may fail if memory is low * via the call path: alloc_inode() -> inode_init_always() -> * security_inode_alloc() -> inode_alloc_security(). Since * the existing code is written such that zfs_mknode() can * not fail retry until sufficient memory has been reclaimed. */ do { *zpp = zfs_znode_alloc(zfsvfs, db, 0, obj_type, sa_hdl); } while (*zpp == NULL); VERIFY(*zpp != NULL); VERIFY(dzp != NULL); } else { /* * If we are creating the root node, the "parent" we * passed in is the znode for the root. */ *zpp = dzp; (*zpp)->z_sa_hdl = sa_hdl; } (*zpp)->z_pflags = pflags; (*zpp)->z_mode = ZTOI(*zpp)->i_mode = mode; (*zpp)->z_dnodesize = dnodesize; (*zpp)->z_projid = projid; if (obj_type == DMU_OT_ZNODE || acl_ids->z_aclp->z_version < ZFS_ACL_VERSION_FUID) { VERIFY0(zfs_aclset_common(*zpp, acl_ids->z_aclp, cr, tx)); } kmem_free(sa_attrs, sizeof (sa_bulk_attr_t) * ZPL_END); zfs_znode_hold_exit(zfsvfs, zh); } /* * Update in-core attributes. It is assumed the caller will be doing an * sa_bulk_update to push the changes out. */ void zfs_xvattr_set(znode_t *zp, xvattr_t *xvap, dmu_tx_t *tx) { xoptattr_t *xoap; boolean_t update_inode = B_FALSE; xoap = xva_getxoptattr(xvap); ASSERT(xoap); if (XVA_ISSET_REQ(xvap, XAT_CREATETIME)) { uint64_t times[2]; ZFS_TIME_ENCODE(&xoap->xoa_createtime, times); (void) sa_update(zp->z_sa_hdl, SA_ZPL_CRTIME(ZTOZSB(zp)), ×, sizeof (times), tx); XVA_SET_RTN(xvap, XAT_CREATETIME); } if (XVA_ISSET_REQ(xvap, XAT_READONLY)) { ZFS_ATTR_SET(zp, ZFS_READONLY, xoap->xoa_readonly, zp->z_pflags, tx); XVA_SET_RTN(xvap, XAT_READONLY); } if (XVA_ISSET_REQ(xvap, XAT_HIDDEN)) { ZFS_ATTR_SET(zp, ZFS_HIDDEN, xoap->xoa_hidden, zp->z_pflags, tx); XVA_SET_RTN(xvap, XAT_HIDDEN); } if (XVA_ISSET_REQ(xvap, XAT_SYSTEM)) { ZFS_ATTR_SET(zp, ZFS_SYSTEM, xoap->xoa_system, zp->z_pflags, tx); XVA_SET_RTN(xvap, XAT_SYSTEM); } if (XVA_ISSET_REQ(xvap, XAT_ARCHIVE)) { ZFS_ATTR_SET(zp, ZFS_ARCHIVE, xoap->xoa_archive, zp->z_pflags, tx); XVA_SET_RTN(xvap, XAT_ARCHIVE); } if (XVA_ISSET_REQ(xvap, XAT_IMMUTABLE)) { ZFS_ATTR_SET(zp, ZFS_IMMUTABLE, xoap->xoa_immutable, zp->z_pflags, tx); XVA_SET_RTN(xvap, XAT_IMMUTABLE); update_inode = B_TRUE; } if (XVA_ISSET_REQ(xvap, XAT_NOUNLINK)) { ZFS_ATTR_SET(zp, ZFS_NOUNLINK, xoap->xoa_nounlink, zp->z_pflags, tx); XVA_SET_RTN(xvap, XAT_NOUNLINK); } if (XVA_ISSET_REQ(xvap, XAT_APPENDONLY)) { ZFS_ATTR_SET(zp, ZFS_APPENDONLY, xoap->xoa_appendonly, zp->z_pflags, tx); XVA_SET_RTN(xvap, XAT_APPENDONLY); update_inode = B_TRUE; } if (XVA_ISSET_REQ(xvap, XAT_NODUMP)) { ZFS_ATTR_SET(zp, ZFS_NODUMP, xoap->xoa_nodump, zp->z_pflags, tx); XVA_SET_RTN(xvap, XAT_NODUMP); } if (XVA_ISSET_REQ(xvap, XAT_OPAQUE)) { ZFS_ATTR_SET(zp, ZFS_OPAQUE, xoap->xoa_opaque, zp->z_pflags, tx); XVA_SET_RTN(xvap, XAT_OPAQUE); } if (XVA_ISSET_REQ(xvap, XAT_AV_QUARANTINED)) { ZFS_ATTR_SET(zp, ZFS_AV_QUARANTINED, xoap->xoa_av_quarantined, zp->z_pflags, tx); XVA_SET_RTN(xvap, XAT_AV_QUARANTINED); } if (XVA_ISSET_REQ(xvap, XAT_AV_MODIFIED)) { ZFS_ATTR_SET(zp, ZFS_AV_MODIFIED, xoap->xoa_av_modified, zp->z_pflags, tx); XVA_SET_RTN(xvap, XAT_AV_MODIFIED); } if (XVA_ISSET_REQ(xvap, XAT_AV_SCANSTAMP)) { zfs_sa_set_scanstamp(zp, xvap, tx); XVA_SET_RTN(xvap, XAT_AV_SCANSTAMP); } if (XVA_ISSET_REQ(xvap, XAT_REPARSE)) { ZFS_ATTR_SET(zp, ZFS_REPARSE, xoap->xoa_reparse, zp->z_pflags, tx); XVA_SET_RTN(xvap, XAT_REPARSE); } if (XVA_ISSET_REQ(xvap, XAT_OFFLINE)) { ZFS_ATTR_SET(zp, ZFS_OFFLINE, xoap->xoa_offline, zp->z_pflags, tx); XVA_SET_RTN(xvap, XAT_OFFLINE); } if (XVA_ISSET_REQ(xvap, XAT_SPARSE)) { ZFS_ATTR_SET(zp, ZFS_SPARSE, xoap->xoa_sparse, zp->z_pflags, tx); XVA_SET_RTN(xvap, XAT_SPARSE); } if (XVA_ISSET_REQ(xvap, XAT_PROJINHERIT)) { ZFS_ATTR_SET(zp, ZFS_PROJINHERIT, xoap->xoa_projinherit, zp->z_pflags, tx); XVA_SET_RTN(xvap, XAT_PROJINHERIT); } if (update_inode) zfs_set_inode_flags(zp, ZTOI(zp)); } int zfs_zget(zfsvfs_t *zfsvfs, uint64_t obj_num, znode_t **zpp) { dmu_object_info_t doi; dmu_buf_t *db; znode_t *zp; znode_hold_t *zh; int err; sa_handle_t *hdl; *zpp = NULL; again: zh = zfs_znode_hold_enter(zfsvfs, obj_num); err = sa_buf_hold(zfsvfs->z_os, obj_num, NULL, &db); if (err) { zfs_znode_hold_exit(zfsvfs, zh); return (err); } dmu_object_info_from_db(db, &doi); if (doi.doi_bonus_type != DMU_OT_SA && (doi.doi_bonus_type != DMU_OT_ZNODE || (doi.doi_bonus_type == DMU_OT_ZNODE && doi.doi_bonus_size < sizeof (znode_phys_t)))) { sa_buf_rele(db, NULL); zfs_znode_hold_exit(zfsvfs, zh); return (SET_ERROR(EINVAL)); } hdl = dmu_buf_get_user(db); if (hdl != NULL) { zp = sa_get_userdata(hdl); /* * Since "SA" does immediate eviction we * should never find a sa handle that doesn't * know about the znode. */ ASSERT3P(zp, !=, NULL); mutex_enter(&zp->z_lock); ASSERT3U(zp->z_id, ==, obj_num); /* * If zp->z_unlinked is set, the znode is already marked * for deletion and should not be discovered. Check this * after checking igrab() due to fsetxattr() & O_TMPFILE. * * If igrab() returns NULL the VFS has independently * determined the inode should be evicted and has * called iput_final() to start the eviction process. * The SA handle is still valid but because the VFS * requires that the eviction succeed we must drop * our locks and references to allow the eviction to * complete. The zfs_zget() may then be retried. * * This unlikely case could be optimized by registering * a sops->drop_inode() callback. The callback would * need to detect the active SA hold thereby informing * the VFS that this inode should not be evicted. */ if (igrab(ZTOI(zp)) == NULL) { if (zp->z_unlinked) err = SET_ERROR(ENOENT); else err = SET_ERROR(EAGAIN); } else { *zpp = zp; err = 0; } mutex_exit(&zp->z_lock); sa_buf_rele(db, NULL); zfs_znode_hold_exit(zfsvfs, zh); if (err == EAGAIN) { /* inode might need this to finish evict */ cond_resched(); goto again; } return (err); } /* * Not found create new znode/vnode but only if file exists. * * There is a small window where zfs_vget() could * find this object while a file create is still in * progress. This is checked for in zfs_znode_alloc() * * if zfs_znode_alloc() fails it will drop the hold on the * bonus buffer. */ zp = zfs_znode_alloc(zfsvfs, db, doi.doi_data_block_size, doi.doi_bonus_type, NULL); if (zp == NULL) { err = SET_ERROR(ENOENT); } else { *zpp = zp; } zfs_znode_hold_exit(zfsvfs, zh); return (err); } int zfs_rezget(znode_t *zp) { zfsvfs_t *zfsvfs = ZTOZSB(zp); dmu_object_info_t doi; dmu_buf_t *db; uint64_t obj_num = zp->z_id; uint64_t mode; uint64_t links; sa_bulk_attr_t bulk[11]; int err; int count = 0; uint64_t gen; uint64_t z_uid, z_gid; uint64_t atime[2], mtime[2], ctime[2], btime[2]; uint64_t projid = ZFS_DEFAULT_PROJID; znode_hold_t *zh; /* * skip ctldir, otherwise they will always get invalidated. This will * cause funny behaviour for the mounted snapdirs. Especially for * Linux >= 3.18, d_invalidate will detach the mountpoint and prevent * anyone automount it again as long as someone is still using the * detached mount. */ if (zp->z_is_ctldir) return (0); zh = zfs_znode_hold_enter(zfsvfs, obj_num); mutex_enter(&zp->z_acl_lock); if (zp->z_acl_cached) { zfs_acl_free(zp->z_acl_cached); zp->z_acl_cached = NULL; } mutex_exit(&zp->z_acl_lock); rw_enter(&zp->z_xattr_lock, RW_WRITER); if (zp->z_xattr_cached) { nvlist_free(zp->z_xattr_cached); zp->z_xattr_cached = NULL; } rw_exit(&zp->z_xattr_lock); ASSERT(zp->z_sa_hdl == NULL); err = sa_buf_hold(zfsvfs->z_os, obj_num, NULL, &db); if (err) { zfs_znode_hold_exit(zfsvfs, zh); return (err); } dmu_object_info_from_db(db, &doi); if (doi.doi_bonus_type != DMU_OT_SA && (doi.doi_bonus_type != DMU_OT_ZNODE || (doi.doi_bonus_type == DMU_OT_ZNODE && doi.doi_bonus_size < sizeof (znode_phys_t)))) { sa_buf_rele(db, NULL); zfs_znode_hold_exit(zfsvfs, zh); return (SET_ERROR(EINVAL)); } zfs_znode_sa_init(zfsvfs, zp, db, doi.doi_bonus_type, NULL); /* reload cached values */ SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_GEN(zfsvfs), NULL, &gen, sizeof (gen)); SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_SIZE(zfsvfs), NULL, &zp->z_size, sizeof (zp->z_size)); SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_LINKS(zfsvfs), NULL, &links, sizeof (links)); SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_FLAGS(zfsvfs), NULL, &zp->z_pflags, sizeof (zp->z_pflags)); SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_UID(zfsvfs), NULL, &z_uid, sizeof (z_uid)); SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_GID(zfsvfs), NULL, &z_gid, sizeof (z_gid)); SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_MODE(zfsvfs), NULL, &mode, sizeof (mode)); SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_ATIME(zfsvfs), NULL, &atime, 16); SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_MTIME(zfsvfs), NULL, &mtime, 16); SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_CTIME(zfsvfs), NULL, &ctime, 16); SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_CRTIME(zfsvfs), NULL, &btime, 16); if (sa_bulk_lookup(zp->z_sa_hdl, bulk, count)) { zfs_znode_dmu_fini(zp); zfs_znode_hold_exit(zfsvfs, zh); return (SET_ERROR(EIO)); } if (dmu_objset_projectquota_enabled(zfsvfs->z_os)) { err = sa_lookup(zp->z_sa_hdl, SA_ZPL_PROJID(zfsvfs), &projid, 8); if (err != 0 && err != ENOENT) { zfs_znode_dmu_fini(zp); zfs_znode_hold_exit(zfsvfs, zh); return (SET_ERROR(err)); } } zp->z_projid = projid; zp->z_mode = ZTOI(zp)->i_mode = mode; zfs_uid_write(ZTOI(zp), z_uid); zfs_gid_write(ZTOI(zp), z_gid); ZFS_TIME_DECODE(&ZTOI(zp)->i_atime, atime); ZFS_TIME_DECODE(&ZTOI(zp)->i_mtime, mtime); ZFS_TIME_DECODE(&ZTOI(zp)->i_ctime, ctime); ZFS_TIME_DECODE(&zp->z_btime, btime); if ((uint32_t)gen != ZTOI(zp)->i_generation) { zfs_znode_dmu_fini(zp); zfs_znode_hold_exit(zfsvfs, zh); return (SET_ERROR(EIO)); } set_nlink(ZTOI(zp), (uint32_t)links); zfs_set_inode_flags(zp, ZTOI(zp)); zp->z_blksz = doi.doi_data_block_size; zp->z_atime_dirty = B_FALSE; zfs_znode_update_vfs(zp); /* * If the file has zero links, then it has been unlinked on the send * side and it must be in the received unlinked set. * We call zfs_znode_dmu_fini() now to prevent any accesses to the * stale data and to prevent automatic removal of the file in * zfs_zinactive(). The file will be removed either when it is removed * on the send side and the next incremental stream is received or * when the unlinked set gets processed. */ zp->z_unlinked = (ZTOI(zp)->i_nlink == 0); if (zp->z_unlinked) zfs_znode_dmu_fini(zp); zfs_znode_hold_exit(zfsvfs, zh); return (0); } void zfs_znode_delete(znode_t *zp, dmu_tx_t *tx) { zfsvfs_t *zfsvfs = ZTOZSB(zp); objset_t *os = zfsvfs->z_os; uint64_t obj = zp->z_id; uint64_t acl_obj = zfs_external_acl(zp); znode_hold_t *zh; zh = zfs_znode_hold_enter(zfsvfs, obj); if (acl_obj) { VERIFY(!zp->z_is_sa); VERIFY(0 == dmu_object_free(os, acl_obj, tx)); } VERIFY(0 == dmu_object_free(os, obj, tx)); zfs_znode_dmu_fini(zp); zfs_znode_hold_exit(zfsvfs, zh); } void zfs_zinactive(znode_t *zp) { zfsvfs_t *zfsvfs = ZTOZSB(zp); uint64_t z_id = zp->z_id; znode_hold_t *zh; ASSERT(zp->z_sa_hdl); /* * Don't allow a zfs_zget() while were trying to release this znode. */ zh = zfs_znode_hold_enter(zfsvfs, z_id); mutex_enter(&zp->z_lock); /* * If this was the last reference to a file with no links, remove * the file from the file system unless the file system is mounted * read-only. That can happen, for example, if the file system was * originally read-write, the file was opened, then unlinked and * the file system was made read-only before the file was finally * closed. The file will remain in the unlinked set. */ if (zp->z_unlinked) { ASSERT(!zfsvfs->z_issnap); if (!zfs_is_readonly(zfsvfs) && !zfs_unlink_suspend_progress) { mutex_exit(&zp->z_lock); zfs_znode_hold_exit(zfsvfs, zh); zfs_rmnode(zp); return; } } mutex_exit(&zp->z_lock); zfs_znode_dmu_fini(zp); zfs_znode_hold_exit(zfsvfs, zh); } #if defined(HAVE_INODE_TIMESPEC64_TIMES) #define zfs_compare_timespec timespec64_compare #else #define zfs_compare_timespec timespec_compare #endif /* * Determine whether the znode's atime must be updated. The logic mostly * duplicates the Linux kernel's relatime_need_update() functionality. * This function is only called if the underlying filesystem actually has * atime updates enabled. */ boolean_t zfs_relatime_need_update(const struct inode *ip) { inode_timespec_t now; gethrestime(&now); /* * In relatime mode, only update the atime if the previous atime * is earlier than either the ctime or mtime or if at least a day * has passed since the last update of atime. */ if (zfs_compare_timespec(&ip->i_mtime, &ip->i_atime) >= 0) return (B_TRUE); if (zfs_compare_timespec(&ip->i_ctime, &ip->i_atime) >= 0) return (B_TRUE); if ((hrtime_t)now.tv_sec - (hrtime_t)ip->i_atime.tv_sec >= 24*60*60) return (B_TRUE); return (B_FALSE); } /* * Prepare to update znode time stamps. * * IN: zp - znode requiring timestamp update * flag - ATTR_MTIME, ATTR_CTIME flags * * OUT: zp - z_seq * mtime - new mtime * ctime - new ctime * * Note: We don't update atime here, because we rely on Linux VFS to do * atime updating. */ void zfs_tstamp_update_setup(znode_t *zp, uint_t flag, uint64_t mtime[2], uint64_t ctime[2]) { inode_timespec_t now; gethrestime(&now); zp->z_seq++; if (flag & ATTR_MTIME) { ZFS_TIME_ENCODE(&now, mtime); ZFS_TIME_DECODE(&(ZTOI(zp)->i_mtime), mtime); if (ZTOZSB(zp)->z_use_fuids) { zp->z_pflags |= (ZFS_ARCHIVE | ZFS_AV_MODIFIED); } } if (flag & ATTR_CTIME) { ZFS_TIME_ENCODE(&now, ctime); ZFS_TIME_DECODE(&(ZTOI(zp)->i_ctime), ctime); if (ZTOZSB(zp)->z_use_fuids) zp->z_pflags |= ZFS_ARCHIVE; } } /* * Grow the block size for a file. * * IN: zp - znode of file to free data in. * size - requested block size * tx - open transaction. * * NOTE: this function assumes that the znode is write locked. */ void zfs_grow_blocksize(znode_t *zp, uint64_t size, dmu_tx_t *tx) { int error; u_longlong_t dummy; if (size <= zp->z_blksz) return; /* * If the file size is already greater than the current blocksize, * we will not grow. If there is more than one block in a file, * the blocksize cannot change. */ if (zp->z_blksz && zp->z_size > zp->z_blksz) return; error = dmu_object_set_blocksize(ZTOZSB(zp)->z_os, zp->z_id, size, 0, tx); if (error == ENOTSUP) return; ASSERT0(error); /* What blocksize did we actually get? */ dmu_object_size_from_db(sa_get_db(zp->z_sa_hdl), &zp->z_blksz, &dummy); } /* * Increase the file length * * IN: zp - znode of file to free data in. * end - new end-of-file * * RETURN: 0 on success, error code on failure */ static int zfs_extend(znode_t *zp, uint64_t end) { zfsvfs_t *zfsvfs = ZTOZSB(zp); dmu_tx_t *tx; zfs_locked_range_t *lr; uint64_t newblksz; int error; /* * We will change zp_size, lock the whole file. */ lr = zfs_rangelock_enter(&zp->z_rangelock, 0, UINT64_MAX, RL_WRITER); /* * Nothing to do if file already at desired length. */ if (end <= zp->z_size) { zfs_rangelock_exit(lr); return (0); } tx = dmu_tx_create(zfsvfs->z_os); dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_FALSE); zfs_sa_upgrade_txholds(tx, zp); if (end > zp->z_blksz && (!ISP2(zp->z_blksz) || zp->z_blksz < zfsvfs->z_max_blksz)) { /* * We are growing the file past the current block size. */ if (zp->z_blksz > ZTOZSB(zp)->z_max_blksz) { /* * File's blocksize is already larger than the * "recordsize" property. Only let it grow to * the next power of 2. */ ASSERT(!ISP2(zp->z_blksz)); newblksz = MIN(end, 1 << highbit64(zp->z_blksz)); } else { newblksz = MIN(end, ZTOZSB(zp)->z_max_blksz); } dmu_tx_hold_write(tx, zp->z_id, 0, newblksz); } else { newblksz = 0; } error = dmu_tx_assign(tx, TXG_WAIT); if (error) { dmu_tx_abort(tx); zfs_rangelock_exit(lr); return (error); } if (newblksz) zfs_grow_blocksize(zp, newblksz, tx); zp->z_size = end; VERIFY(0 == sa_update(zp->z_sa_hdl, SA_ZPL_SIZE(ZTOZSB(zp)), &zp->z_size, sizeof (zp->z_size), tx)); zfs_rangelock_exit(lr); dmu_tx_commit(tx); return (0); } /* * zfs_zero_partial_page - Modeled after update_pages() but * with different arguments and semantics for use by zfs_freesp(). * * Zeroes a piece of a single page cache entry for zp at offset * start and length len. * * Caller must acquire a range lock on the file for the region * being zeroed in order that the ARC and page cache stay in sync. */ static void zfs_zero_partial_page(znode_t *zp, uint64_t start, uint64_t len) { struct address_space *mp = ZTOI(zp)->i_mapping; struct page *pp; int64_t off; void *pb; ASSERT((start & PAGE_MASK) == ((start + len - 1) & PAGE_MASK)); off = start & (PAGE_SIZE - 1); start &= PAGE_MASK; pp = find_lock_page(mp, start >> PAGE_SHIFT); if (pp) { if (mapping_writably_mapped(mp)) flush_dcache_page(pp); pb = kmap(pp); memset(pb + off, 0, len); kunmap(pp); if (mapping_writably_mapped(mp)) flush_dcache_page(pp); mark_page_accessed(pp); SetPageUptodate(pp); ClearPageError(pp); unlock_page(pp); put_page(pp); } } /* * Free space in a file. * * IN: zp - znode of file to free data in. * off - start of section to free. * len - length of section to free. * * RETURN: 0 on success, error code on failure */ static int zfs_free_range(znode_t *zp, uint64_t off, uint64_t len) { zfsvfs_t *zfsvfs = ZTOZSB(zp); zfs_locked_range_t *lr; int error; /* * Lock the range being freed. */ lr = zfs_rangelock_enter(&zp->z_rangelock, off, len, RL_WRITER); /* * Nothing to do if file already at desired length. */ if (off >= zp->z_size) { zfs_rangelock_exit(lr); return (0); } if (off + len > zp->z_size) len = zp->z_size - off; error = dmu_free_long_range(zfsvfs->z_os, zp->z_id, off, len); /* * Zero partial page cache entries. This must be done under a * range lock in order to keep the ARC and page cache in sync. */ if (zp->z_is_mapped) { loff_t first_page, last_page, page_len; loff_t first_page_offset, last_page_offset; /* first possible full page in hole */ first_page = (off + PAGE_SIZE - 1) >> PAGE_SHIFT; /* last page of hole */ last_page = (off + len) >> PAGE_SHIFT; /* offset of first_page */ first_page_offset = first_page << PAGE_SHIFT; /* offset of last_page */ last_page_offset = last_page << PAGE_SHIFT; /* truncate whole pages */ if (last_page_offset > first_page_offset) { truncate_inode_pages_range(ZTOI(zp)->i_mapping, first_page_offset, last_page_offset - 1); } /* truncate sub-page ranges */ if (first_page > last_page) { /* entire punched area within a single page */ zfs_zero_partial_page(zp, off, len); } else { /* beginning of punched area at the end of a page */ page_len = first_page_offset - off; if (page_len > 0) zfs_zero_partial_page(zp, off, page_len); /* end of punched area at the beginning of a page */ page_len = off + len - last_page_offset; if (page_len > 0) zfs_zero_partial_page(zp, last_page_offset, page_len); } } zfs_rangelock_exit(lr); return (error); } /* * Truncate a file * * IN: zp - znode of file to free data in. * end - new end-of-file. * * RETURN: 0 on success, error code on failure */ static int zfs_trunc(znode_t *zp, uint64_t end) { zfsvfs_t *zfsvfs = ZTOZSB(zp); dmu_tx_t *tx; zfs_locked_range_t *lr; int error; sa_bulk_attr_t bulk[2]; int count = 0; /* * We will change zp_size, lock the whole file. */ lr = zfs_rangelock_enter(&zp->z_rangelock, 0, UINT64_MAX, RL_WRITER); /* * Nothing to do if file already at desired length. */ if (end >= zp->z_size) { zfs_rangelock_exit(lr); return (0); } error = dmu_free_long_range(zfsvfs->z_os, zp->z_id, end, DMU_OBJECT_END); if (error) { zfs_rangelock_exit(lr); return (error); } tx = dmu_tx_create(zfsvfs->z_os); dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_FALSE); zfs_sa_upgrade_txholds(tx, zp); dmu_tx_mark_netfree(tx); error = dmu_tx_assign(tx, TXG_WAIT); if (error) { dmu_tx_abort(tx); zfs_rangelock_exit(lr); return (error); } zp->z_size = end; SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_SIZE(zfsvfs), NULL, &zp->z_size, sizeof (zp->z_size)); if (end == 0) { zp->z_pflags &= ~ZFS_SPARSE; SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_FLAGS(zfsvfs), NULL, &zp->z_pflags, 8); } VERIFY(sa_bulk_update(zp->z_sa_hdl, bulk, count, tx) == 0); dmu_tx_commit(tx); zfs_rangelock_exit(lr); return (0); } /* * Free space in a file * * IN: zp - znode of file to free data in. * off - start of range * len - end of range (0 => EOF) * flag - current file open mode flags. * log - TRUE if this action should be logged * * RETURN: 0 on success, error code on failure */ int zfs_freesp(znode_t *zp, uint64_t off, uint64_t len, int flag, boolean_t log) { dmu_tx_t *tx; zfsvfs_t *zfsvfs = ZTOZSB(zp); zilog_t *zilog = zfsvfs->z_log; uint64_t mode; uint64_t mtime[2], ctime[2]; sa_bulk_attr_t bulk[3]; int count = 0; int error; if ((error = sa_lookup(zp->z_sa_hdl, SA_ZPL_MODE(zfsvfs), &mode, sizeof (mode))) != 0) return (error); if (off > zp->z_size) { error = zfs_extend(zp, off+len); if (error == 0 && log) goto log; goto out; } if (len == 0) { error = zfs_trunc(zp, off); } else { if ((error = zfs_free_range(zp, off, len)) == 0 && off + len > zp->z_size) error = zfs_extend(zp, off+len); } if (error || !log) goto out; log: tx = dmu_tx_create(zfsvfs->z_os); dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_FALSE); zfs_sa_upgrade_txholds(tx, zp); error = dmu_tx_assign(tx, TXG_WAIT); if (error) { dmu_tx_abort(tx); goto out; } SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_MTIME(zfsvfs), NULL, mtime, 16); SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_CTIME(zfsvfs), NULL, ctime, 16); SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_FLAGS(zfsvfs), NULL, &zp->z_pflags, 8); zfs_tstamp_update_setup(zp, CONTENT_MODIFIED, mtime, ctime); error = sa_bulk_update(zp->z_sa_hdl, bulk, count, tx); ASSERT(error == 0); zfs_log_truncate(zilog, tx, TX_TRUNCATE, zp, off, len); dmu_tx_commit(tx); zfs_znode_update_vfs(zp); error = 0; out: /* * Truncate the page cache - for file truncate operations, use * the purpose-built API for truncations. For punching operations, * the truncation is handled under a range lock in zfs_free_range. */ if (len == 0) truncate_setsize(ZTOI(zp), off); return (error); } void zfs_create_fs(objset_t *os, cred_t *cr, nvlist_t *zplprops, dmu_tx_t *tx) { struct super_block *sb; zfsvfs_t *zfsvfs; uint64_t moid, obj, sa_obj, version; uint64_t sense = ZFS_CASE_SENSITIVE; uint64_t norm = 0; nvpair_t *elem; int size; int error; int i; znode_t *rootzp = NULL; vattr_t vattr; znode_t *zp; zfs_acl_ids_t acl_ids; /* * First attempt to create master node. */ /* * In an empty objset, there are no blocks to read and thus * there can be no i/o errors (which we assert below). */ moid = MASTER_NODE_OBJ; error = zap_create_claim(os, moid, DMU_OT_MASTER_NODE, DMU_OT_NONE, 0, tx); ASSERT(error == 0); /* * Set starting attributes. */ version = zfs_zpl_version_map(spa_version(dmu_objset_spa(os))); elem = NULL; while ((elem = nvlist_next_nvpair(zplprops, elem)) != NULL) { /* For the moment we expect all zpl props to be uint64_ts */ uint64_t val; char *name; ASSERT(nvpair_type(elem) == DATA_TYPE_UINT64); VERIFY(nvpair_value_uint64(elem, &val) == 0); name = nvpair_name(elem); if (strcmp(name, zfs_prop_to_name(ZFS_PROP_VERSION)) == 0) { if (val < version) version = val; } else { error = zap_update(os, moid, name, 8, 1, &val, tx); } ASSERT(error == 0); if (strcmp(name, zfs_prop_to_name(ZFS_PROP_NORMALIZE)) == 0) norm = val; else if (strcmp(name, zfs_prop_to_name(ZFS_PROP_CASE)) == 0) sense = val; } ASSERT(version != 0); error = zap_update(os, moid, ZPL_VERSION_STR, 8, 1, &version, tx); /* * Create zap object used for SA attribute registration */ if (version >= ZPL_VERSION_SA) { sa_obj = zap_create(os, DMU_OT_SA_MASTER_NODE, DMU_OT_NONE, 0, tx); error = zap_add(os, moid, ZFS_SA_ATTRS, 8, 1, &sa_obj, tx); ASSERT(error == 0); } else { sa_obj = 0; } /* * Create a delete queue. */ obj = zap_create(os, DMU_OT_UNLINKED_SET, DMU_OT_NONE, 0, tx); error = zap_add(os, moid, ZFS_UNLINKED_SET, 8, 1, &obj, tx); ASSERT(error == 0); /* * Create root znode. Create minimal znode/inode/zfsvfs/sb * to allow zfs_mknode to work. */ vattr.va_mask = ATTR_MODE|ATTR_UID|ATTR_GID; vattr.va_mode = S_IFDIR|0755; vattr.va_uid = crgetuid(cr); vattr.va_gid = crgetgid(cr); rootzp = kmem_cache_alloc(znode_cache, KM_SLEEP); rootzp->z_unlinked = B_FALSE; rootzp->z_atime_dirty = B_FALSE; rootzp->z_is_sa = USE_SA(version, os); rootzp->z_pflags = 0; zfsvfs = kmem_zalloc(sizeof (zfsvfs_t), KM_SLEEP); zfsvfs->z_os = os; zfsvfs->z_parent = zfsvfs; zfsvfs->z_version = version; zfsvfs->z_use_fuids = USE_FUIDS(version, os); zfsvfs->z_use_sa = USE_SA(version, os); zfsvfs->z_norm = norm; sb = kmem_zalloc(sizeof (struct super_block), KM_SLEEP); sb->s_fs_info = zfsvfs; ZTOI(rootzp)->i_sb = sb; error = sa_setup(os, sa_obj, zfs_attr_table, ZPL_END, &zfsvfs->z_attr_table); ASSERT(error == 0); /* * Fold case on file systems that are always or sometimes case * insensitive. */ if (sense == ZFS_CASE_INSENSITIVE || sense == ZFS_CASE_MIXED) zfsvfs->z_norm |= U8_TEXTPREP_TOUPPER; mutex_init(&zfsvfs->z_znodes_lock, NULL, MUTEX_DEFAULT, NULL); list_create(&zfsvfs->z_all_znodes, sizeof (znode_t), offsetof(znode_t, z_link_node)); size = MIN(1 << (highbit64(zfs_object_mutex_size)-1), ZFS_OBJ_MTX_MAX); zfsvfs->z_hold_size = size; zfsvfs->z_hold_trees = vmem_zalloc(sizeof (avl_tree_t) * size, KM_SLEEP); zfsvfs->z_hold_locks = vmem_zalloc(sizeof (kmutex_t) * size, KM_SLEEP); for (i = 0; i != size; i++) { avl_create(&zfsvfs->z_hold_trees[i], zfs_znode_hold_compare, sizeof (znode_hold_t), offsetof(znode_hold_t, zh_node)); mutex_init(&zfsvfs->z_hold_locks[i], NULL, MUTEX_DEFAULT, NULL); } VERIFY(0 == zfs_acl_ids_create(rootzp, IS_ROOT_NODE, &vattr, cr, NULL, &acl_ids)); zfs_mknode(rootzp, &vattr, tx, cr, IS_ROOT_NODE, &zp, &acl_ids); ASSERT3P(zp, ==, rootzp); error = zap_add(os, moid, ZFS_ROOT_OBJ, 8, 1, &rootzp->z_id, tx); ASSERT(error == 0); zfs_acl_ids_free(&acl_ids); atomic_set(&ZTOI(rootzp)->i_count, 0); sa_handle_destroy(rootzp->z_sa_hdl); kmem_cache_free(znode_cache, rootzp); for (i = 0; i != size; i++) { avl_destroy(&zfsvfs->z_hold_trees[i]); mutex_destroy(&zfsvfs->z_hold_locks[i]); } mutex_destroy(&zfsvfs->z_znodes_lock); vmem_free(zfsvfs->z_hold_trees, sizeof (avl_tree_t) * size); vmem_free(zfsvfs->z_hold_locks, sizeof (kmutex_t) * size); kmem_free(sb, sizeof (struct super_block)); kmem_free(zfsvfs, sizeof (zfsvfs_t)); } #endif /* _KERNEL */ static int zfs_sa_setup(objset_t *osp, sa_attr_type_t **sa_table) { uint64_t sa_obj = 0; int error; error = zap_lookup(osp, MASTER_NODE_OBJ, ZFS_SA_ATTRS, 8, 1, &sa_obj); if (error != 0 && error != ENOENT) return (error); error = sa_setup(osp, sa_obj, zfs_attr_table, ZPL_END, sa_table); return (error); } static int zfs_grab_sa_handle(objset_t *osp, uint64_t obj, sa_handle_t **hdlp, dmu_buf_t **db, const void *tag) { dmu_object_info_t doi; int error; if ((error = sa_buf_hold(osp, obj, tag, db)) != 0) return (error); dmu_object_info_from_db(*db, &doi); if ((doi.doi_bonus_type != DMU_OT_SA && doi.doi_bonus_type != DMU_OT_ZNODE) || (doi.doi_bonus_type == DMU_OT_ZNODE && doi.doi_bonus_size < sizeof (znode_phys_t))) { sa_buf_rele(*db, tag); return (SET_ERROR(ENOTSUP)); } error = sa_handle_get(osp, obj, NULL, SA_HDL_PRIVATE, hdlp); if (error != 0) { sa_buf_rele(*db, tag); return (error); } return (0); } static void zfs_release_sa_handle(sa_handle_t *hdl, dmu_buf_t *db, const void *tag) { sa_handle_destroy(hdl); sa_buf_rele(db, tag); } /* * Given an object number, return its parent object number and whether * or not the object is an extended attribute directory. */ static int zfs_obj_to_pobj(objset_t *osp, sa_handle_t *hdl, sa_attr_type_t *sa_table, uint64_t *pobjp, int *is_xattrdir) { uint64_t parent; uint64_t pflags; uint64_t mode; uint64_t parent_mode; sa_bulk_attr_t bulk[3]; sa_handle_t *sa_hdl; dmu_buf_t *sa_db; int count = 0; int error; SA_ADD_BULK_ATTR(bulk, count, sa_table[ZPL_PARENT], NULL, &parent, sizeof (parent)); SA_ADD_BULK_ATTR(bulk, count, sa_table[ZPL_FLAGS], NULL, &pflags, sizeof (pflags)); SA_ADD_BULK_ATTR(bulk, count, sa_table[ZPL_MODE], NULL, &mode, sizeof (mode)); if ((error = sa_bulk_lookup(hdl, bulk, count)) != 0) return (error); /* * When a link is removed its parent pointer is not changed and will * be invalid. There are two cases where a link is removed but the * file stays around, when it goes to the delete queue and when there * are additional links. */ error = zfs_grab_sa_handle(osp, parent, &sa_hdl, &sa_db, FTAG); if (error != 0) return (error); error = sa_lookup(sa_hdl, ZPL_MODE, &parent_mode, sizeof (parent_mode)); zfs_release_sa_handle(sa_hdl, sa_db, FTAG); if (error != 0) return (error); *is_xattrdir = ((pflags & ZFS_XATTR) != 0) && S_ISDIR(mode); /* * Extended attributes can be applied to files, directories, etc. * Otherwise the parent must be a directory. */ if (!*is_xattrdir && !S_ISDIR(parent_mode)) return (SET_ERROR(EINVAL)); *pobjp = parent; return (0); } /* * Given an object number, return some zpl level statistics */ static int zfs_obj_to_stats_impl(sa_handle_t *hdl, sa_attr_type_t *sa_table, zfs_stat_t *sb) { sa_bulk_attr_t bulk[4]; int count = 0; SA_ADD_BULK_ATTR(bulk, count, sa_table[ZPL_MODE], NULL, &sb->zs_mode, sizeof (sb->zs_mode)); SA_ADD_BULK_ATTR(bulk, count, sa_table[ZPL_GEN], NULL, &sb->zs_gen, sizeof (sb->zs_gen)); SA_ADD_BULK_ATTR(bulk, count, sa_table[ZPL_LINKS], NULL, &sb->zs_links, sizeof (sb->zs_links)); SA_ADD_BULK_ATTR(bulk, count, sa_table[ZPL_CTIME], NULL, &sb->zs_ctime, sizeof (sb->zs_ctime)); return (sa_bulk_lookup(hdl, bulk, count)); } static int zfs_obj_to_path_impl(objset_t *osp, uint64_t obj, sa_handle_t *hdl, sa_attr_type_t *sa_table, char *buf, int len) { sa_handle_t *sa_hdl; sa_handle_t *prevhdl = NULL; dmu_buf_t *prevdb = NULL; dmu_buf_t *sa_db = NULL; char *path = buf + len - 1; int error; *path = '\0'; sa_hdl = hdl; uint64_t deleteq_obj; VERIFY0(zap_lookup(osp, MASTER_NODE_OBJ, ZFS_UNLINKED_SET, sizeof (uint64_t), 1, &deleteq_obj)); error = zap_lookup_int(osp, deleteq_obj, obj); if (error == 0) { return (ESTALE); } else if (error != ENOENT) { return (error); } - error = 0; for (;;) { uint64_t pobj = 0; char component[MAXNAMELEN + 2]; size_t complen; int is_xattrdir = 0; if (prevdb) { ASSERT(prevhdl != NULL); zfs_release_sa_handle(prevhdl, prevdb, FTAG); } if ((error = zfs_obj_to_pobj(osp, sa_hdl, sa_table, &pobj, &is_xattrdir)) != 0) break; if (pobj == obj) { if (path[0] != '/') *--path = '/'; break; } component[0] = '/'; if (is_xattrdir) { strcpy(component + 1, ""); } else { error = zap_value_search(osp, pobj, obj, ZFS_DIRENT_OBJ(-1ULL), component + 1); if (error != 0) break; } complen = strlen(component); path -= complen; ASSERT(path >= buf); memcpy(path, component, complen); obj = pobj; if (sa_hdl != hdl) { prevhdl = sa_hdl; prevdb = sa_db; } error = zfs_grab_sa_handle(osp, obj, &sa_hdl, &sa_db, FTAG); if (error != 0) { sa_hdl = prevhdl; sa_db = prevdb; break; } } if (sa_hdl != NULL && sa_hdl != hdl) { ASSERT(sa_db != NULL); zfs_release_sa_handle(sa_hdl, sa_db, FTAG); } if (error == 0) (void) memmove(buf, path, buf + len - path); return (error); } int zfs_obj_to_path(objset_t *osp, uint64_t obj, char *buf, int len) { sa_attr_type_t *sa_table; sa_handle_t *hdl; dmu_buf_t *db; int error; error = zfs_sa_setup(osp, &sa_table); if (error != 0) return (error); error = zfs_grab_sa_handle(osp, obj, &hdl, &db, FTAG); if (error != 0) return (error); error = zfs_obj_to_path_impl(osp, obj, hdl, sa_table, buf, len); zfs_release_sa_handle(hdl, db, FTAG); return (error); } int zfs_obj_to_stats(objset_t *osp, uint64_t obj, zfs_stat_t *sb, char *buf, int len) { char *path = buf + len - 1; sa_attr_type_t *sa_table; sa_handle_t *hdl; dmu_buf_t *db; int error; *path = '\0'; error = zfs_sa_setup(osp, &sa_table); if (error != 0) return (error); error = zfs_grab_sa_handle(osp, obj, &hdl, &db, FTAG); if (error != 0) return (error); error = zfs_obj_to_stats_impl(hdl, sa_table, sb); if (error != 0) { zfs_release_sa_handle(hdl, db, FTAG); return (error); } error = zfs_obj_to_path_impl(osp, obj, hdl, sa_table, buf, len); zfs_release_sa_handle(hdl, db, FTAG); return (error); } #if defined(_KERNEL) EXPORT_SYMBOL(zfs_create_fs); EXPORT_SYMBOL(zfs_obj_to_path); /* CSTYLED */ module_param(zfs_object_mutex_size, uint, 0644); MODULE_PARM_DESC(zfs_object_mutex_size, "Size of znode hold array"); module_param(zfs_unlink_suspend_progress, int, 0644); MODULE_PARM_DESC(zfs_unlink_suspend_progress, "Set to prevent async unlinks " "(debug - leaks space into the unlinked set)"); #endif diff --git a/module/os/linux/zfs/zio_crypt.c b/module/os/linux/zfs/zio_crypt.c index 2bc1482e91ec..6f2bf7ed7569 100644 --- a/module/os/linux/zfs/zio_crypt.c +++ b/module/os/linux/zfs/zio_crypt.c @@ -1,2048 +1,2047 @@ /* * CDDL HEADER START * * This file and its contents are supplied under the terms of the * Common Development and Distribution License ("CDDL"), version 1.0. * You may only use this file in accordance with the terms of version * 1.0 of the CDDL. * * A full copy of the text of the CDDL should have accompanied this * source. A copy of the CDDL is also available via the Internet at * http://www.illumos.org/license/CDDL. * * CDDL HEADER END */ /* * Copyright (c) 2017, Datto, Inc. All rights reserved. */ #include #include #include #include #include #include #include #include #include #include /* * This file is responsible for handling all of the details of generating * encryption parameters and performing encryption and authentication. * * BLOCK ENCRYPTION PARAMETERS: * Encryption /Authentication Algorithm Suite (crypt): * The encryption algorithm, mode, and key length we are going to use. We * currently support AES in either GCM or CCM modes with 128, 192, and 256 bit * keys. All authentication is currently done with SHA512-HMAC. * * Plaintext: * The unencrypted data that we want to encrypt. * * Initialization Vector (IV): * An initialization vector for the encryption algorithms. This is used to * "tweak" the encryption algorithms so that two blocks of the same data are * encrypted into different ciphertext outputs, thus obfuscating block patterns. * The supported encryption modes (AES-GCM and AES-CCM) require that an IV is * never reused with the same encryption key. This value is stored unencrypted * and must simply be provided to the decryption function. We use a 96 bit IV * (as recommended by NIST) for all block encryption. For non-dedup blocks we * derive the IV randomly. The first 64 bits of the IV are stored in the second * word of DVA[2] and the remaining 32 bits are stored in the upper 32 bits of * blk_fill. This is safe because encrypted blocks can't use the upper 32 bits * of blk_fill. We only encrypt level 0 blocks, which normally have a fill count * of 1. The only exception is for DMU_OT_DNODE objects, where the fill count of * level 0 blocks is the number of allocated dnodes in that block. The on-disk * format supports at most 2^15 slots per L0 dnode block, because the maximum * block size is 16MB (2^24). In either case, for level 0 blocks this number * will still be smaller than UINT32_MAX so it is safe to store the IV in the * top 32 bits of blk_fill, while leaving the bottom 32 bits of the fill count * for the dnode code. * * Master key: * This is the most important secret data of an encrypted dataset. It is used * along with the salt to generate that actual encryption keys via HKDF. We * do not use the master key to directly encrypt any data because there are * theoretical limits on how much data can actually be safely encrypted with * any encryption mode. The master key is stored encrypted on disk with the * user's wrapping key. Its length is determined by the encryption algorithm. * For details on how this is stored see the block comment in dsl_crypt.c * * Salt: * Used as an input to the HKDF function, along with the master key. We use a * 64 bit salt, stored unencrypted in the first word of DVA[2]. Any given salt * can be used for encrypting many blocks, so we cache the current salt and the * associated derived key in zio_crypt_t so we do not need to derive it again * needlessly. * * Encryption Key: * A secret binary key, generated from an HKDF function used to encrypt and * decrypt data. * * Message Authentication Code (MAC) * The MAC is an output of authenticated encryption modes such as AES-GCM and * AES-CCM. Its purpose is to ensure that an attacker cannot modify encrypted * data on disk and return garbage to the application. Effectively, it is a * checksum that can not be reproduced by an attacker. We store the MAC in the * second 128 bits of blk_cksum, leaving the first 128 bits for a truncated * regular checksum of the ciphertext which can be used for scrubbing. * * OBJECT AUTHENTICATION: * Some object types, such as DMU_OT_MASTER_NODE cannot be encrypted because * they contain some info that always needs to be readable. To prevent this * data from being altered, we authenticate this data using SHA512-HMAC. This * will produce a MAC (similar to the one produced via encryption) which can * be used to verify the object was not modified. HMACs do not require key * rotation or IVs, so we can keep up to the full 3 copies of authenticated * data. * * ZIL ENCRYPTION: * ZIL blocks have their bp written to disk ahead of the associated data, so we * cannot store the MAC there as we normally do. For these blocks the MAC is * stored in the embedded checksum within the zil_chain_t header. The salt and * IV are generated for the block on bp allocation instead of at encryption * time. In addition, ZIL blocks have some pieces that must be left in plaintext * for claiming even though all of the sensitive user data still needs to be * encrypted. The function zio_crypt_init_uios_zil() handles parsing which * pieces of the block need to be encrypted. All data that is not encrypted is * authenticated using the AAD mechanisms that the supported encryption modes * provide for. In order to preserve the semantics of the ZIL for encrypted * datasets, the ZIL is not protected at the objset level as described below. * * DNODE ENCRYPTION: * Similarly to ZIL blocks, the core part of each dnode_phys_t needs to be left * in plaintext for scrubbing and claiming, but the bonus buffers might contain * sensitive user data. The function zio_crypt_init_uios_dnode() handles parsing * which pieces of the block need to be encrypted. For more details about * dnode authentication and encryption, see zio_crypt_init_uios_dnode(). * * OBJECT SET AUTHENTICATION: * Up to this point, everything we have encrypted and authenticated has been * at level 0 (or -2 for the ZIL). If we did not do any further work the * on-disk format would be susceptible to attacks that deleted or rearranged * the order of level 0 blocks. Ideally, the cleanest solution would be to * maintain a tree of authentication MACs going up the bp tree. However, this * presents a problem for raw sends. Send files do not send information about * indirect blocks so there would be no convenient way to transfer the MACs and * they cannot be recalculated on the receive side without the master key which * would defeat one of the purposes of raw sends in the first place. Instead, * for the indirect levels of the bp tree, we use a regular SHA512 of the MACs * from the level below. We also include some portable fields from blk_prop such * as the lsize and compression algorithm to prevent the data from being * misinterpreted. * * At the objset level, we maintain 2 separate 256 bit MACs in the * objset_phys_t. The first one is "portable" and is the logical root of the * MAC tree maintained in the metadnode's bps. The second, is "local" and is * used as the root MAC for the user accounting objects, which are also not * transferred via "zfs send". The portable MAC is sent in the DRR_BEGIN payload * of the send file. The useraccounting code ensures that the useraccounting * info is not present upon a receive, so the local MAC can simply be cleared * out at that time. For more info about objset_phys_t authentication, see * zio_crypt_do_objset_hmacs(). * * CONSIDERATIONS FOR DEDUP: * In order for dedup to work, blocks that we want to dedup with one another * need to use the same IV and encryption key, so that they will have the same * ciphertext. Normally, one should never reuse an IV with the same encryption * key or else AES-GCM and AES-CCM can both actually leak the plaintext of both * blocks. In this case, however, since we are using the same plaintext as * well all that we end up with is a duplicate of the original ciphertext we * already had. As a result, an attacker with read access to the raw disk will * be able to tell which blocks are the same but this information is given away * by dedup anyway. In order to get the same IVs and encryption keys for * equivalent blocks of data we use an HMAC of the plaintext. We use an HMAC * here so that a reproducible checksum of the plaintext is never available to * the attacker. The HMAC key is kept alongside the master key, encrypted on * disk. The first 64 bits of the HMAC are used in place of the random salt, and * the next 96 bits are used as the IV. As a result of this mechanism, dedup * will only work within a clone family since encrypted dedup requires use of * the same master and HMAC keys. */ /* * After encrypting many blocks with the same key we may start to run up * against the theoretical limits of how much data can securely be encrypted * with a single key using the supported encryption modes. The most obvious * limitation is that our risk of generating 2 equivalent 96 bit IVs increases * the more IVs we generate (which both GCM and CCM modes strictly forbid). * This risk actually grows surprisingly quickly over time according to the * Birthday Problem. With a total IV space of 2^(96 bits), and assuming we have * generated n IVs with a cryptographically secure RNG, the approximate * probability p(n) of a collision is given as: * * p(n) ~= e^(-n*(n-1)/(2*(2^96))) * * [http://www.math.cornell.edu/~mec/2008-2009/TianyiZheng/Birthday.html] * * Assuming that we want to ensure that p(n) never goes over 1 / 1 trillion * we must not write more than 398,065,730 blocks with the same encryption key. * Therefore, we rotate our keys after 400,000,000 blocks have been written by * generating a new random 64 bit salt for our HKDF encryption key generation * function. */ #define ZFS_KEY_MAX_SALT_USES_DEFAULT 400000000 #define ZFS_CURRENT_MAX_SALT_USES \ (MIN(zfs_key_max_salt_uses, ZFS_KEY_MAX_SALT_USES_DEFAULT)) static unsigned long zfs_key_max_salt_uses = ZFS_KEY_MAX_SALT_USES_DEFAULT; typedef struct blkptr_auth_buf { uint64_t bab_prop; /* blk_prop - portable mask */ uint8_t bab_mac[ZIO_DATA_MAC_LEN]; /* MAC from blk_cksum */ uint64_t bab_pad; /* reserved for future use */ } blkptr_auth_buf_t; const zio_crypt_info_t zio_crypt_table[ZIO_CRYPT_FUNCTIONS] = { {"", ZC_TYPE_NONE, 0, "inherit"}, {"", ZC_TYPE_NONE, 0, "on"}, {"", ZC_TYPE_NONE, 0, "off"}, {SUN_CKM_AES_CCM, ZC_TYPE_CCM, 16, "aes-128-ccm"}, {SUN_CKM_AES_CCM, ZC_TYPE_CCM, 24, "aes-192-ccm"}, {SUN_CKM_AES_CCM, ZC_TYPE_CCM, 32, "aes-256-ccm"}, {SUN_CKM_AES_GCM, ZC_TYPE_GCM, 16, "aes-128-gcm"}, {SUN_CKM_AES_GCM, ZC_TYPE_GCM, 24, "aes-192-gcm"}, {SUN_CKM_AES_GCM, ZC_TYPE_GCM, 32, "aes-256-gcm"} }; void zio_crypt_key_destroy(zio_crypt_key_t *key) { rw_destroy(&key->zk_salt_lock); /* free crypto templates */ crypto_destroy_ctx_template(key->zk_current_tmpl); crypto_destroy_ctx_template(key->zk_hmac_tmpl); /* zero out sensitive data */ memset(key, 0, sizeof (zio_crypt_key_t)); } int zio_crypt_key_init(uint64_t crypt, zio_crypt_key_t *key) { int ret; crypto_mechanism_t mech; uint_t keydata_len; ASSERT(key != NULL); ASSERT3U(crypt, <, ZIO_CRYPT_FUNCTIONS); keydata_len = zio_crypt_table[crypt].ci_keylen; memset(key, 0, sizeof (zio_crypt_key_t)); rw_init(&key->zk_salt_lock, NULL, RW_DEFAULT, NULL); /* fill keydata buffers and salt with random data */ ret = random_get_bytes((uint8_t *)&key->zk_guid, sizeof (uint64_t)); if (ret != 0) goto error; ret = random_get_bytes(key->zk_master_keydata, keydata_len); if (ret != 0) goto error; ret = random_get_bytes(key->zk_hmac_keydata, SHA512_HMAC_KEYLEN); if (ret != 0) goto error; ret = random_get_bytes(key->zk_salt, ZIO_DATA_SALT_LEN); if (ret != 0) goto error; /* derive the current key from the master key */ ret = hkdf_sha512(key->zk_master_keydata, keydata_len, NULL, 0, key->zk_salt, ZIO_DATA_SALT_LEN, key->zk_current_keydata, keydata_len); if (ret != 0) goto error; /* initialize keys for the ICP */ key->zk_current_key.ck_data = key->zk_current_keydata; key->zk_current_key.ck_length = CRYPTO_BYTES2BITS(keydata_len); key->zk_hmac_key.ck_data = &key->zk_hmac_key; key->zk_hmac_key.ck_length = CRYPTO_BYTES2BITS(SHA512_HMAC_KEYLEN); /* * Initialize the crypto templates. It's ok if this fails because * this is just an optimization. */ mech.cm_type = crypto_mech2id(zio_crypt_table[crypt].ci_mechname); ret = crypto_create_ctx_template(&mech, &key->zk_current_key, &key->zk_current_tmpl); if (ret != CRYPTO_SUCCESS) key->zk_current_tmpl = NULL; mech.cm_type = crypto_mech2id(SUN_CKM_SHA512_HMAC); ret = crypto_create_ctx_template(&mech, &key->zk_hmac_key, &key->zk_hmac_tmpl); if (ret != CRYPTO_SUCCESS) key->zk_hmac_tmpl = NULL; key->zk_crypt = crypt; key->zk_version = ZIO_CRYPT_KEY_CURRENT_VERSION; key->zk_salt_count = 0; return (0); error: zio_crypt_key_destroy(key); return (ret); } static int zio_crypt_key_change_salt(zio_crypt_key_t *key) { int ret = 0; uint8_t salt[ZIO_DATA_SALT_LEN]; crypto_mechanism_t mech; uint_t keydata_len = zio_crypt_table[key->zk_crypt].ci_keylen; /* generate a new salt */ ret = random_get_bytes(salt, ZIO_DATA_SALT_LEN); if (ret != 0) goto error; rw_enter(&key->zk_salt_lock, RW_WRITER); /* someone beat us to the salt rotation, just unlock and return */ if (key->zk_salt_count < ZFS_CURRENT_MAX_SALT_USES) goto out_unlock; /* derive the current key from the master key and the new salt */ ret = hkdf_sha512(key->zk_master_keydata, keydata_len, NULL, 0, salt, ZIO_DATA_SALT_LEN, key->zk_current_keydata, keydata_len); if (ret != 0) goto out_unlock; /* assign the salt and reset the usage count */ memcpy(key->zk_salt, salt, ZIO_DATA_SALT_LEN); key->zk_salt_count = 0; /* destroy the old context template and create the new one */ crypto_destroy_ctx_template(key->zk_current_tmpl); ret = crypto_create_ctx_template(&mech, &key->zk_current_key, &key->zk_current_tmpl); if (ret != CRYPTO_SUCCESS) key->zk_current_tmpl = NULL; rw_exit(&key->zk_salt_lock); return (0); out_unlock: rw_exit(&key->zk_salt_lock); error: return (ret); } /* See comment above zfs_key_max_salt_uses definition for details */ int zio_crypt_key_get_salt(zio_crypt_key_t *key, uint8_t *salt) { int ret; boolean_t salt_change; rw_enter(&key->zk_salt_lock, RW_READER); memcpy(salt, key->zk_salt, ZIO_DATA_SALT_LEN); salt_change = (atomic_inc_64_nv(&key->zk_salt_count) >= ZFS_CURRENT_MAX_SALT_USES); rw_exit(&key->zk_salt_lock); if (salt_change) { ret = zio_crypt_key_change_salt(key); if (ret != 0) goto error; } return (0); error: return (ret); } /* * This function handles all encryption and decryption in zfs. When * encrypting it expects puio to reference the plaintext and cuio to * reference the ciphertext. cuio must have enough space for the * ciphertext + room for a MAC. datalen should be the length of the * plaintext / ciphertext alone. */ static int zio_do_crypt_uio(boolean_t encrypt, uint64_t crypt, crypto_key_t *key, crypto_ctx_template_t tmpl, uint8_t *ivbuf, uint_t datalen, zfs_uio_t *puio, zfs_uio_t *cuio, uint8_t *authbuf, uint_t auth_len) { int ret; crypto_data_t plaindata, cipherdata; CK_AES_CCM_PARAMS ccmp; CK_AES_GCM_PARAMS gcmp; crypto_mechanism_t mech; zio_crypt_info_t crypt_info; uint_t plain_full_len, maclen; ASSERT3U(crypt, <, ZIO_CRYPT_FUNCTIONS); /* lookup the encryption info */ crypt_info = zio_crypt_table[crypt]; /* the mac will always be the last iovec_t in the cipher uio */ maclen = cuio->uio_iov[cuio->uio_iovcnt - 1].iov_len; ASSERT(maclen <= ZIO_DATA_MAC_LEN); /* setup encryption mechanism (same as crypt) */ mech.cm_type = crypto_mech2id(crypt_info.ci_mechname); /* * Strangely, the ICP requires that plain_full_len must include * the MAC length when decrypting, even though the UIO does not * need to have the extra space allocated. */ if (encrypt) { plain_full_len = datalen; } else { plain_full_len = datalen + maclen; } /* * setup encryption params (currently only AES CCM and AES GCM * are supported) */ if (crypt_info.ci_crypt_type == ZC_TYPE_CCM) { ccmp.ulNonceSize = ZIO_DATA_IV_LEN; ccmp.ulAuthDataSize = auth_len; ccmp.authData = authbuf; ccmp.ulMACSize = maclen; ccmp.nonce = ivbuf; ccmp.ulDataSize = plain_full_len; mech.cm_param = (char *)(&ccmp); mech.cm_param_len = sizeof (CK_AES_CCM_PARAMS); } else { gcmp.ulIvLen = ZIO_DATA_IV_LEN; gcmp.ulIvBits = CRYPTO_BYTES2BITS(ZIO_DATA_IV_LEN); gcmp.ulAADLen = auth_len; gcmp.pAAD = authbuf; gcmp.ulTagBits = CRYPTO_BYTES2BITS(maclen); gcmp.pIv = ivbuf; mech.cm_param = (char *)(&gcmp); mech.cm_param_len = sizeof (CK_AES_GCM_PARAMS); } /* populate the cipher and plain data structs. */ plaindata.cd_format = CRYPTO_DATA_UIO; plaindata.cd_offset = 0; plaindata.cd_uio = puio; plaindata.cd_length = plain_full_len; cipherdata.cd_format = CRYPTO_DATA_UIO; cipherdata.cd_offset = 0; cipherdata.cd_uio = cuio; cipherdata.cd_length = datalen + maclen; /* perform the actual encryption */ if (encrypt) { ret = crypto_encrypt(&mech, &plaindata, key, tmpl, &cipherdata); if (ret != CRYPTO_SUCCESS) { ret = SET_ERROR(EIO); goto error; } } else { ret = crypto_decrypt(&mech, &cipherdata, key, tmpl, &plaindata); if (ret != CRYPTO_SUCCESS) { ASSERT3U(ret, ==, CRYPTO_INVALID_MAC); ret = SET_ERROR(ECKSUM); goto error; } } return (0); error: return (ret); } int zio_crypt_key_wrap(crypto_key_t *cwkey, zio_crypt_key_t *key, uint8_t *iv, uint8_t *mac, uint8_t *keydata_out, uint8_t *hmac_keydata_out) { int ret; zfs_uio_t puio, cuio; uint64_t aad[3]; iovec_t plain_iovecs[2], cipher_iovecs[3]; uint64_t crypt = key->zk_crypt; uint_t enc_len, keydata_len, aad_len; ASSERT3U(crypt, <, ZIO_CRYPT_FUNCTIONS); keydata_len = zio_crypt_table[crypt].ci_keylen; /* generate iv for wrapping the master and hmac key */ ret = random_get_pseudo_bytes(iv, WRAPPING_IV_LEN); if (ret != 0) goto error; /* initialize zfs_uio_ts */ plain_iovecs[0].iov_base = key->zk_master_keydata; plain_iovecs[0].iov_len = keydata_len; plain_iovecs[1].iov_base = key->zk_hmac_keydata; plain_iovecs[1].iov_len = SHA512_HMAC_KEYLEN; cipher_iovecs[0].iov_base = keydata_out; cipher_iovecs[0].iov_len = keydata_len; cipher_iovecs[1].iov_base = hmac_keydata_out; cipher_iovecs[1].iov_len = SHA512_HMAC_KEYLEN; cipher_iovecs[2].iov_base = mac; cipher_iovecs[2].iov_len = WRAPPING_MAC_LEN; /* * Although we don't support writing to the old format, we do * support rewrapping the key so that the user can move and * quarantine datasets on the old format. */ if (key->zk_version == 0) { aad_len = sizeof (uint64_t); aad[0] = LE_64(key->zk_guid); } else { ASSERT3U(key->zk_version, ==, ZIO_CRYPT_KEY_CURRENT_VERSION); aad_len = sizeof (uint64_t) * 3; aad[0] = LE_64(key->zk_guid); aad[1] = LE_64(crypt); aad[2] = LE_64(key->zk_version); } enc_len = zio_crypt_table[crypt].ci_keylen + SHA512_HMAC_KEYLEN; puio.uio_iov = plain_iovecs; puio.uio_iovcnt = 2; puio.uio_segflg = UIO_SYSSPACE; cuio.uio_iov = cipher_iovecs; cuio.uio_iovcnt = 3; cuio.uio_segflg = UIO_SYSSPACE; /* encrypt the keys and store the resulting ciphertext and mac */ ret = zio_do_crypt_uio(B_TRUE, crypt, cwkey, NULL, iv, enc_len, &puio, &cuio, (uint8_t *)aad, aad_len); if (ret != 0) goto error; return (0); error: return (ret); } int zio_crypt_key_unwrap(crypto_key_t *cwkey, uint64_t crypt, uint64_t version, uint64_t guid, uint8_t *keydata, uint8_t *hmac_keydata, uint8_t *iv, uint8_t *mac, zio_crypt_key_t *key) { crypto_mechanism_t mech; zfs_uio_t puio, cuio; uint64_t aad[3]; iovec_t plain_iovecs[2], cipher_iovecs[3]; uint_t enc_len, keydata_len, aad_len; int ret; ASSERT3U(crypt, <, ZIO_CRYPT_FUNCTIONS); rw_init(&key->zk_salt_lock, NULL, RW_DEFAULT, NULL); keydata_len = zio_crypt_table[crypt].ci_keylen; /* initialize zfs_uio_ts */ plain_iovecs[0].iov_base = key->zk_master_keydata; plain_iovecs[0].iov_len = keydata_len; plain_iovecs[1].iov_base = key->zk_hmac_keydata; plain_iovecs[1].iov_len = SHA512_HMAC_KEYLEN; cipher_iovecs[0].iov_base = keydata; cipher_iovecs[0].iov_len = keydata_len; cipher_iovecs[1].iov_base = hmac_keydata; cipher_iovecs[1].iov_len = SHA512_HMAC_KEYLEN; cipher_iovecs[2].iov_base = mac; cipher_iovecs[2].iov_len = WRAPPING_MAC_LEN; if (version == 0) { aad_len = sizeof (uint64_t); aad[0] = LE_64(guid); } else { ASSERT3U(version, ==, ZIO_CRYPT_KEY_CURRENT_VERSION); aad_len = sizeof (uint64_t) * 3; aad[0] = LE_64(guid); aad[1] = LE_64(crypt); aad[2] = LE_64(version); } enc_len = keydata_len + SHA512_HMAC_KEYLEN; puio.uio_iov = plain_iovecs; puio.uio_segflg = UIO_SYSSPACE; puio.uio_iovcnt = 2; cuio.uio_iov = cipher_iovecs; cuio.uio_iovcnt = 3; cuio.uio_segflg = UIO_SYSSPACE; /* decrypt the keys and store the result in the output buffers */ ret = zio_do_crypt_uio(B_FALSE, crypt, cwkey, NULL, iv, enc_len, &puio, &cuio, (uint8_t *)aad, aad_len); if (ret != 0) goto error; /* generate a fresh salt */ ret = random_get_bytes(key->zk_salt, ZIO_DATA_SALT_LEN); if (ret != 0) goto error; /* derive the current key from the master key */ ret = hkdf_sha512(key->zk_master_keydata, keydata_len, NULL, 0, key->zk_salt, ZIO_DATA_SALT_LEN, key->zk_current_keydata, keydata_len); if (ret != 0) goto error; /* initialize keys for ICP */ key->zk_current_key.ck_data = key->zk_current_keydata; key->zk_current_key.ck_length = CRYPTO_BYTES2BITS(keydata_len); key->zk_hmac_key.ck_data = key->zk_hmac_keydata; key->zk_hmac_key.ck_length = CRYPTO_BYTES2BITS(SHA512_HMAC_KEYLEN); /* * Initialize the crypto templates. It's ok if this fails because * this is just an optimization. */ mech.cm_type = crypto_mech2id(zio_crypt_table[crypt].ci_mechname); ret = crypto_create_ctx_template(&mech, &key->zk_current_key, &key->zk_current_tmpl); if (ret != CRYPTO_SUCCESS) key->zk_current_tmpl = NULL; mech.cm_type = crypto_mech2id(SUN_CKM_SHA512_HMAC); ret = crypto_create_ctx_template(&mech, &key->zk_hmac_key, &key->zk_hmac_tmpl); if (ret != CRYPTO_SUCCESS) key->zk_hmac_tmpl = NULL; key->zk_crypt = crypt; key->zk_version = version; key->zk_guid = guid; key->zk_salt_count = 0; return (0); error: zio_crypt_key_destroy(key); return (ret); } int zio_crypt_generate_iv(uint8_t *ivbuf) { int ret; /* randomly generate the IV */ ret = random_get_pseudo_bytes(ivbuf, ZIO_DATA_IV_LEN); if (ret != 0) goto error; return (0); error: memset(ivbuf, 0, ZIO_DATA_IV_LEN); return (ret); } int zio_crypt_do_hmac(zio_crypt_key_t *key, uint8_t *data, uint_t datalen, uint8_t *digestbuf, uint_t digestlen) { int ret; crypto_mechanism_t mech; crypto_data_t in_data, digest_data; uint8_t raw_digestbuf[SHA512_DIGEST_LENGTH]; ASSERT3U(digestlen, <=, SHA512_DIGEST_LENGTH); /* initialize sha512-hmac mechanism and crypto data */ mech.cm_type = crypto_mech2id(SUN_CKM_SHA512_HMAC); mech.cm_param = NULL; mech.cm_param_len = 0; /* initialize the crypto data */ in_data.cd_format = CRYPTO_DATA_RAW; in_data.cd_offset = 0; in_data.cd_length = datalen; in_data.cd_raw.iov_base = (char *)data; in_data.cd_raw.iov_len = in_data.cd_length; digest_data.cd_format = CRYPTO_DATA_RAW; digest_data.cd_offset = 0; digest_data.cd_length = SHA512_DIGEST_LENGTH; digest_data.cd_raw.iov_base = (char *)raw_digestbuf; digest_data.cd_raw.iov_len = digest_data.cd_length; /* generate the hmac */ ret = crypto_mac(&mech, &in_data, &key->zk_hmac_key, key->zk_hmac_tmpl, &digest_data); if (ret != CRYPTO_SUCCESS) { ret = SET_ERROR(EIO); goto error; } memcpy(digestbuf, raw_digestbuf, digestlen); return (0); error: memset(digestbuf, 0, digestlen); return (ret); } int zio_crypt_generate_iv_salt_dedup(zio_crypt_key_t *key, uint8_t *data, uint_t datalen, uint8_t *ivbuf, uint8_t *salt) { int ret; uint8_t digestbuf[SHA512_DIGEST_LENGTH]; ret = zio_crypt_do_hmac(key, data, datalen, digestbuf, SHA512_DIGEST_LENGTH); if (ret != 0) return (ret); memcpy(salt, digestbuf, ZIO_DATA_SALT_LEN); memcpy(ivbuf, digestbuf + ZIO_DATA_SALT_LEN, ZIO_DATA_IV_LEN); return (0); } /* * The following functions are used to encode and decode encryption parameters * into blkptr_t and zil_header_t. The ICP wants to use these parameters as * byte strings, which normally means that these strings would not need to deal * with byteswapping at all. However, both blkptr_t and zil_header_t may be * byteswapped by lower layers and so we must "undo" that byteswap here upon * decoding and encoding in a non-native byteorder. These functions require * that the byteorder bit is correct before being called. */ void zio_crypt_encode_params_bp(blkptr_t *bp, uint8_t *salt, uint8_t *iv) { uint64_t val64; uint32_t val32; ASSERT(BP_IS_ENCRYPTED(bp)); if (!BP_SHOULD_BYTESWAP(bp)) { memcpy(&bp->blk_dva[2].dva_word[0], salt, sizeof (uint64_t)); memcpy(&bp->blk_dva[2].dva_word[1], iv, sizeof (uint64_t)); memcpy(&val32, iv + sizeof (uint64_t), sizeof (uint32_t)); BP_SET_IV2(bp, val32); } else { memcpy(&val64, salt, sizeof (uint64_t)); bp->blk_dva[2].dva_word[0] = BSWAP_64(val64); memcpy(&val64, iv, sizeof (uint64_t)); bp->blk_dva[2].dva_word[1] = BSWAP_64(val64); memcpy(&val32, iv + sizeof (uint64_t), sizeof (uint32_t)); BP_SET_IV2(bp, BSWAP_32(val32)); } } void zio_crypt_decode_params_bp(const blkptr_t *bp, uint8_t *salt, uint8_t *iv) { uint64_t val64; uint32_t val32; ASSERT(BP_IS_PROTECTED(bp)); /* for convenience, so callers don't need to check */ if (BP_IS_AUTHENTICATED(bp)) { memset(salt, 0, ZIO_DATA_SALT_LEN); memset(iv, 0, ZIO_DATA_IV_LEN); return; } if (!BP_SHOULD_BYTESWAP(bp)) { memcpy(salt, &bp->blk_dva[2].dva_word[0], sizeof (uint64_t)); memcpy(iv, &bp->blk_dva[2].dva_word[1], sizeof (uint64_t)); val32 = (uint32_t)BP_GET_IV2(bp); memcpy(iv + sizeof (uint64_t), &val32, sizeof (uint32_t)); } else { val64 = BSWAP_64(bp->blk_dva[2].dva_word[0]); memcpy(salt, &val64, sizeof (uint64_t)); val64 = BSWAP_64(bp->blk_dva[2].dva_word[1]); memcpy(iv, &val64, sizeof (uint64_t)); val32 = BSWAP_32((uint32_t)BP_GET_IV2(bp)); memcpy(iv + sizeof (uint64_t), &val32, sizeof (uint32_t)); } } void zio_crypt_encode_mac_bp(blkptr_t *bp, uint8_t *mac) { uint64_t val64; ASSERT(BP_USES_CRYPT(bp)); ASSERT3U(BP_GET_TYPE(bp), !=, DMU_OT_OBJSET); if (!BP_SHOULD_BYTESWAP(bp)) { memcpy(&bp->blk_cksum.zc_word[2], mac, sizeof (uint64_t)); memcpy(&bp->blk_cksum.zc_word[3], mac + sizeof (uint64_t), sizeof (uint64_t)); } else { memcpy(&val64, mac, sizeof (uint64_t)); bp->blk_cksum.zc_word[2] = BSWAP_64(val64); memcpy(&val64, mac + sizeof (uint64_t), sizeof (uint64_t)); bp->blk_cksum.zc_word[3] = BSWAP_64(val64); } } void zio_crypt_decode_mac_bp(const blkptr_t *bp, uint8_t *mac) { uint64_t val64; ASSERT(BP_USES_CRYPT(bp) || BP_IS_HOLE(bp)); /* for convenience, so callers don't need to check */ if (BP_GET_TYPE(bp) == DMU_OT_OBJSET) { memset(mac, 0, ZIO_DATA_MAC_LEN); return; } if (!BP_SHOULD_BYTESWAP(bp)) { memcpy(mac, &bp->blk_cksum.zc_word[2], sizeof (uint64_t)); memcpy(mac + sizeof (uint64_t), &bp->blk_cksum.zc_word[3], sizeof (uint64_t)); } else { val64 = BSWAP_64(bp->blk_cksum.zc_word[2]); memcpy(mac, &val64, sizeof (uint64_t)); val64 = BSWAP_64(bp->blk_cksum.zc_word[3]); memcpy(mac + sizeof (uint64_t), &val64, sizeof (uint64_t)); } } void zio_crypt_encode_mac_zil(void *data, uint8_t *mac) { zil_chain_t *zilc = data; memcpy(&zilc->zc_eck.zec_cksum.zc_word[2], mac, sizeof (uint64_t)); memcpy(&zilc->zc_eck.zec_cksum.zc_word[3], mac + sizeof (uint64_t), sizeof (uint64_t)); } void zio_crypt_decode_mac_zil(const void *data, uint8_t *mac) { /* * The ZIL MAC is embedded in the block it protects, which will * not have been byteswapped by the time this function has been called. * As a result, we don't need to worry about byteswapping the MAC. */ const zil_chain_t *zilc = data; memcpy(mac, &zilc->zc_eck.zec_cksum.zc_word[2], sizeof (uint64_t)); memcpy(mac + sizeof (uint64_t), &zilc->zc_eck.zec_cksum.zc_word[3], sizeof (uint64_t)); } /* * This routine takes a block of dnodes (src_abd) and copies only the bonus * buffers to the same offsets in the dst buffer. datalen should be the size * of both the src_abd and the dst buffer (not just the length of the bonus * buffers). */ void zio_crypt_copy_dnode_bonus(abd_t *src_abd, uint8_t *dst, uint_t datalen) { uint_t i, max_dnp = datalen >> DNODE_SHIFT; uint8_t *src; dnode_phys_t *dnp, *sdnp, *ddnp; src = abd_borrow_buf_copy(src_abd, datalen); sdnp = (dnode_phys_t *)src; ddnp = (dnode_phys_t *)dst; for (i = 0; i < max_dnp; i += sdnp[i].dn_extra_slots + 1) { dnp = &sdnp[i]; if (dnp->dn_type != DMU_OT_NONE && DMU_OT_IS_ENCRYPTED(dnp->dn_bonustype) && dnp->dn_bonuslen != 0) { memcpy(DN_BONUS(&ddnp[i]), DN_BONUS(dnp), DN_MAX_BONUS_LEN(dnp)); } } abd_return_buf(src_abd, src, datalen); } /* * This function decides what fields from blk_prop are included in * the on-disk various MAC algorithms. */ static void zio_crypt_bp_zero_nonportable_blkprop(blkptr_t *bp, uint64_t version) { /* * Version 0 did not properly zero out all non-portable fields * as it should have done. We maintain this code so that we can * do read-only imports of pools on this version. */ if (version == 0) { BP_SET_DEDUP(bp, 0); BP_SET_CHECKSUM(bp, 0); BP_SET_PSIZE(bp, SPA_MINBLOCKSIZE); return; } ASSERT3U(version, ==, ZIO_CRYPT_KEY_CURRENT_VERSION); /* * The hole_birth feature might set these fields even if this bp * is a hole. We zero them out here to guarantee that raw sends * will function with or without the feature. */ if (BP_IS_HOLE(bp)) { bp->blk_prop = 0ULL; return; } /* * At L0 we want to verify these fields to ensure that data blocks * can not be reinterpreted. For instance, we do not want an attacker * to trick us into returning raw lz4 compressed data to the user * by modifying the compression bits. At higher levels, we cannot * enforce this policy since raw sends do not convey any information * about indirect blocks, so these values might be different on the * receive side. Fortunately, this does not open any new attack * vectors, since any alterations that can be made to a higher level * bp must still verify the correct order of the layer below it. */ if (BP_GET_LEVEL(bp) != 0) { BP_SET_BYTEORDER(bp, 0); BP_SET_COMPRESS(bp, 0); /* * psize cannot be set to zero or it will trigger * asserts, but the value doesn't really matter as * long as it is constant. */ BP_SET_PSIZE(bp, SPA_MINBLOCKSIZE); } BP_SET_DEDUP(bp, 0); BP_SET_CHECKSUM(bp, 0); } static void zio_crypt_bp_auth_init(uint64_t version, boolean_t should_bswap, blkptr_t *bp, blkptr_auth_buf_t *bab, uint_t *bab_len) { blkptr_t tmpbp = *bp; if (should_bswap) byteswap_uint64_array(&tmpbp, sizeof (blkptr_t)); ASSERT(BP_USES_CRYPT(&tmpbp) || BP_IS_HOLE(&tmpbp)); ASSERT0(BP_IS_EMBEDDED(&tmpbp)); zio_crypt_decode_mac_bp(&tmpbp, bab->bab_mac); /* * We always MAC blk_prop in LE to ensure portability. This * must be done after decoding the mac, since the endianness * will get zero'd out here. */ zio_crypt_bp_zero_nonportable_blkprop(&tmpbp, version); bab->bab_prop = LE_64(tmpbp.blk_prop); bab->bab_pad = 0ULL; /* version 0 did not include the padding */ *bab_len = sizeof (blkptr_auth_buf_t); if (version == 0) *bab_len -= sizeof (uint64_t); } static int zio_crypt_bp_do_hmac_updates(crypto_context_t ctx, uint64_t version, boolean_t should_bswap, blkptr_t *bp) { int ret; uint_t bab_len; blkptr_auth_buf_t bab; crypto_data_t cd; zio_crypt_bp_auth_init(version, should_bswap, bp, &bab, &bab_len); cd.cd_format = CRYPTO_DATA_RAW; cd.cd_offset = 0; cd.cd_length = bab_len; cd.cd_raw.iov_base = (char *)&bab; cd.cd_raw.iov_len = cd.cd_length; ret = crypto_mac_update(ctx, &cd); if (ret != CRYPTO_SUCCESS) { ret = SET_ERROR(EIO); goto error; } return (0); error: return (ret); } static void zio_crypt_bp_do_indrect_checksum_updates(SHA2_CTX *ctx, uint64_t version, boolean_t should_bswap, blkptr_t *bp) { uint_t bab_len; blkptr_auth_buf_t bab; zio_crypt_bp_auth_init(version, should_bswap, bp, &bab, &bab_len); SHA2Update(ctx, &bab, bab_len); } static void zio_crypt_bp_do_aad_updates(uint8_t **aadp, uint_t *aad_len, uint64_t version, boolean_t should_bswap, blkptr_t *bp) { uint_t bab_len; blkptr_auth_buf_t bab; zio_crypt_bp_auth_init(version, should_bswap, bp, &bab, &bab_len); memcpy(*aadp, &bab, bab_len); *aadp += bab_len; *aad_len += bab_len; } static int zio_crypt_do_dnode_hmac_updates(crypto_context_t ctx, uint64_t version, boolean_t should_bswap, dnode_phys_t *dnp) { int ret, i; dnode_phys_t *adnp, tmp_dncore; size_t dn_core_size = offsetof(dnode_phys_t, dn_blkptr); boolean_t le_bswap = (should_bswap == ZFS_HOST_BYTEORDER); crypto_data_t cd; cd.cd_format = CRYPTO_DATA_RAW; cd.cd_offset = 0; /* * Authenticate the core dnode (masking out non-portable bits). * We only copy the first 64 bytes we operate on to avoid the overhead * of copying 512-64 unneeded bytes. The compiler seems to be fine * with that. */ memcpy(&tmp_dncore, dnp, dn_core_size); adnp = &tmp_dncore; if (le_bswap) { adnp->dn_datablkszsec = BSWAP_16(adnp->dn_datablkszsec); adnp->dn_bonuslen = BSWAP_16(adnp->dn_bonuslen); adnp->dn_maxblkid = BSWAP_64(adnp->dn_maxblkid); adnp->dn_used = BSWAP_64(adnp->dn_used); } adnp->dn_flags &= DNODE_CRYPT_PORTABLE_FLAGS_MASK; adnp->dn_used = 0; cd.cd_length = dn_core_size; cd.cd_raw.iov_base = (char *)adnp; cd.cd_raw.iov_len = cd.cd_length; ret = crypto_mac_update(ctx, &cd); if (ret != CRYPTO_SUCCESS) { ret = SET_ERROR(EIO); goto error; } for (i = 0; i < dnp->dn_nblkptr; i++) { ret = zio_crypt_bp_do_hmac_updates(ctx, version, should_bswap, &dnp->dn_blkptr[i]); if (ret != 0) goto error; } if (dnp->dn_flags & DNODE_FLAG_SPILL_BLKPTR) { ret = zio_crypt_bp_do_hmac_updates(ctx, version, should_bswap, DN_SPILL_BLKPTR(dnp)); if (ret != 0) goto error; } return (0); error: return (ret); } /* * objset_phys_t blocks introduce a number of exceptions to the normal * authentication process. objset_phys_t's contain 2 separate HMACS for * protecting the integrity of their data. The portable_mac protects the * metadnode. This MAC can be sent with a raw send and protects against * reordering of data within the metadnode. The local_mac protects the user * accounting objects which are not sent from one system to another. * * In addition, objset blocks are the only blocks that can be modified and * written to disk without the key loaded under certain circumstances. During * zil_claim() we need to be able to update the zil_header_t to complete * claiming log blocks and during raw receives we need to write out the * portable_mac from the send file. Both of these actions are possible * because these fields are not protected by either MAC so neither one will * need to modify the MACs without the key. However, when the modified blocks * are written out they will be byteswapped into the host machine's native * endianness which will modify fields protected by the MAC. As a result, MAC * calculation for objset blocks works slightly differently from other block * types. Where other block types MAC the data in whatever endianness is * written to disk, objset blocks always MAC little endian version of their * values. In the code, should_bswap is the value from BP_SHOULD_BYTESWAP() * and le_bswap indicates whether a byteswap is needed to get this block * into little endian format. */ int zio_crypt_do_objset_hmacs(zio_crypt_key_t *key, void *data, uint_t datalen, boolean_t should_bswap, uint8_t *portable_mac, uint8_t *local_mac) { int ret; crypto_mechanism_t mech; crypto_context_t ctx; crypto_data_t cd; objset_phys_t *osp = data; uint64_t intval; boolean_t le_bswap = (should_bswap == ZFS_HOST_BYTEORDER); uint8_t raw_portable_mac[SHA512_DIGEST_LENGTH]; uint8_t raw_local_mac[SHA512_DIGEST_LENGTH]; /* initialize HMAC mechanism */ mech.cm_type = crypto_mech2id(SUN_CKM_SHA512_HMAC); mech.cm_param = NULL; mech.cm_param_len = 0; cd.cd_format = CRYPTO_DATA_RAW; cd.cd_offset = 0; /* calculate the portable MAC from the portable fields and metadnode */ ret = crypto_mac_init(&mech, &key->zk_hmac_key, NULL, &ctx); if (ret != CRYPTO_SUCCESS) { ret = SET_ERROR(EIO); goto error; } /* add in the os_type */ intval = (le_bswap) ? osp->os_type : BSWAP_64(osp->os_type); cd.cd_length = sizeof (uint64_t); cd.cd_raw.iov_base = (char *)&intval; cd.cd_raw.iov_len = cd.cd_length; ret = crypto_mac_update(ctx, &cd); if (ret != CRYPTO_SUCCESS) { ret = SET_ERROR(EIO); goto error; } /* add in the portable os_flags */ intval = osp->os_flags; if (should_bswap) intval = BSWAP_64(intval); intval &= OBJSET_CRYPT_PORTABLE_FLAGS_MASK; if (!ZFS_HOST_BYTEORDER) intval = BSWAP_64(intval); cd.cd_length = sizeof (uint64_t); cd.cd_raw.iov_base = (char *)&intval; cd.cd_raw.iov_len = cd.cd_length; ret = crypto_mac_update(ctx, &cd); if (ret != CRYPTO_SUCCESS) { ret = SET_ERROR(EIO); goto error; } /* add in fields from the metadnode */ ret = zio_crypt_do_dnode_hmac_updates(ctx, key->zk_version, should_bswap, &osp->os_meta_dnode); if (ret) goto error; /* store the final digest in a temporary buffer and copy what we need */ cd.cd_length = SHA512_DIGEST_LENGTH; cd.cd_raw.iov_base = (char *)raw_portable_mac; cd.cd_raw.iov_len = cd.cd_length; ret = crypto_mac_final(ctx, &cd); if (ret != CRYPTO_SUCCESS) { ret = SET_ERROR(EIO); goto error; } memcpy(portable_mac, raw_portable_mac, ZIO_OBJSET_MAC_LEN); /* * This is necessary here as we check next whether * OBJSET_FLAG_USERACCOUNTING_COMPLETE is set in order to * decide if the local_mac should be zeroed out. That flag will always * be set by dmu_objset_id_quota_upgrade_cb() and * dmu_objset_userspace_upgrade_cb() if useraccounting has been * completed. */ intval = osp->os_flags; if (should_bswap) intval = BSWAP_64(intval); boolean_t uacct_incomplete = !(intval & OBJSET_FLAG_USERACCOUNTING_COMPLETE); /* * The local MAC protects the user, group and project accounting. * If these objects are not present, the local MAC is zeroed out. */ if (uacct_incomplete || (datalen >= OBJSET_PHYS_SIZE_V3 && osp->os_userused_dnode.dn_type == DMU_OT_NONE && osp->os_groupused_dnode.dn_type == DMU_OT_NONE && osp->os_projectused_dnode.dn_type == DMU_OT_NONE) || (datalen >= OBJSET_PHYS_SIZE_V2 && osp->os_userused_dnode.dn_type == DMU_OT_NONE && osp->os_groupused_dnode.dn_type == DMU_OT_NONE) || (datalen <= OBJSET_PHYS_SIZE_V1)) { memset(local_mac, 0, ZIO_OBJSET_MAC_LEN); return (0); } /* calculate the local MAC from the userused and groupused dnodes */ ret = crypto_mac_init(&mech, &key->zk_hmac_key, NULL, &ctx); if (ret != CRYPTO_SUCCESS) { ret = SET_ERROR(EIO); goto error; } /* add in the non-portable os_flags */ intval = osp->os_flags; if (should_bswap) intval = BSWAP_64(intval); intval &= ~OBJSET_CRYPT_PORTABLE_FLAGS_MASK; if (!ZFS_HOST_BYTEORDER) intval = BSWAP_64(intval); cd.cd_length = sizeof (uint64_t); cd.cd_raw.iov_base = (char *)&intval; cd.cd_raw.iov_len = cd.cd_length; ret = crypto_mac_update(ctx, &cd); if (ret != CRYPTO_SUCCESS) { ret = SET_ERROR(EIO); goto error; } /* add in fields from the user accounting dnodes */ if (osp->os_userused_dnode.dn_type != DMU_OT_NONE) { ret = zio_crypt_do_dnode_hmac_updates(ctx, key->zk_version, should_bswap, &osp->os_userused_dnode); if (ret) goto error; } if (osp->os_groupused_dnode.dn_type != DMU_OT_NONE) { ret = zio_crypt_do_dnode_hmac_updates(ctx, key->zk_version, should_bswap, &osp->os_groupused_dnode); if (ret) goto error; } if (osp->os_projectused_dnode.dn_type != DMU_OT_NONE && datalen >= OBJSET_PHYS_SIZE_V3) { ret = zio_crypt_do_dnode_hmac_updates(ctx, key->zk_version, should_bswap, &osp->os_projectused_dnode); if (ret) goto error; } /* store the final digest in a temporary buffer and copy what we need */ cd.cd_length = SHA512_DIGEST_LENGTH; cd.cd_raw.iov_base = (char *)raw_local_mac; cd.cd_raw.iov_len = cd.cd_length; ret = crypto_mac_final(ctx, &cd); if (ret != CRYPTO_SUCCESS) { ret = SET_ERROR(EIO); goto error; } memcpy(local_mac, raw_local_mac, ZIO_OBJSET_MAC_LEN); return (0); error: memset(portable_mac, 0, ZIO_OBJSET_MAC_LEN); memset(local_mac, 0, ZIO_OBJSET_MAC_LEN); return (ret); } static void zio_crypt_destroy_uio(zfs_uio_t *uio) { if (uio->uio_iov) kmem_free(uio->uio_iov, uio->uio_iovcnt * sizeof (iovec_t)); } /* * This function parses an uncompressed indirect block and returns a checksum * of all the portable fields from all of the contained bps. The portable * fields are the MAC and all of the fields from blk_prop except for the dedup, * checksum, and psize bits. For an explanation of the purpose of this, see * the comment block on object set authentication. */ static int zio_crypt_do_indirect_mac_checksum_impl(boolean_t generate, void *buf, uint_t datalen, uint64_t version, boolean_t byteswap, uint8_t *cksum) { blkptr_t *bp; int i, epb = datalen >> SPA_BLKPTRSHIFT; SHA2_CTX ctx; uint8_t digestbuf[SHA512_DIGEST_LENGTH]; /* checksum all of the MACs from the layer below */ SHA2Init(SHA512, &ctx); for (i = 0, bp = buf; i < epb; i++, bp++) { zio_crypt_bp_do_indrect_checksum_updates(&ctx, version, byteswap, bp); } SHA2Final(digestbuf, &ctx); if (generate) { memcpy(cksum, digestbuf, ZIO_DATA_MAC_LEN); return (0); } if (memcmp(digestbuf, cksum, ZIO_DATA_MAC_LEN) != 0) return (SET_ERROR(ECKSUM)); return (0); } int zio_crypt_do_indirect_mac_checksum(boolean_t generate, void *buf, uint_t datalen, boolean_t byteswap, uint8_t *cksum) { int ret; /* * Unfortunately, callers of this function will not always have * easy access to the on-disk format version. This info is * normally found in the DSL Crypto Key, but the checksum-of-MACs * is expected to be verifiable even when the key isn't loaded. * Here, instead of doing a ZAP lookup for the version for each * zio, we simply try both existing formats. */ ret = zio_crypt_do_indirect_mac_checksum_impl(generate, buf, datalen, ZIO_CRYPT_KEY_CURRENT_VERSION, byteswap, cksum); if (ret == ECKSUM) { ASSERT(!generate); ret = zio_crypt_do_indirect_mac_checksum_impl(generate, buf, datalen, 0, byteswap, cksum); } return (ret); } int zio_crypt_do_indirect_mac_checksum_abd(boolean_t generate, abd_t *abd, uint_t datalen, boolean_t byteswap, uint8_t *cksum) { int ret; void *buf; buf = abd_borrow_buf_copy(abd, datalen); ret = zio_crypt_do_indirect_mac_checksum(generate, buf, datalen, byteswap, cksum); abd_return_buf(abd, buf, datalen); return (ret); } /* * Special case handling routine for encrypting / decrypting ZIL blocks. * We do not check for the older ZIL chain because the encryption feature * was not available before the newer ZIL chain was introduced. The goal * here is to encrypt everything except the blkptr_t of a lr_write_t and * the zil_chain_t header. Everything that is not encrypted is authenticated. */ static int zio_crypt_init_uios_zil(boolean_t encrypt, uint8_t *plainbuf, uint8_t *cipherbuf, uint_t datalen, boolean_t byteswap, zfs_uio_t *puio, zfs_uio_t *cuio, uint_t *enc_len, uint8_t **authbuf, uint_t *auth_len, boolean_t *no_crypt) { int ret; uint64_t txtype, lr_len; uint_t nr_src, nr_dst, crypt_len; uint_t aad_len = 0, nr_iovecs = 0, total_len = 0; iovec_t *src_iovecs = NULL, *dst_iovecs = NULL; uint8_t *src, *dst, *slrp, *dlrp, *blkend, *aadp; zil_chain_t *zilc; lr_t *lr; uint8_t *aadbuf = zio_buf_alloc(datalen); /* cipherbuf always needs an extra iovec for the MAC */ if (encrypt) { src = plainbuf; dst = cipherbuf; nr_src = 0; nr_dst = 1; } else { src = cipherbuf; dst = plainbuf; nr_src = 1; nr_dst = 0; } memset(dst, 0, datalen); /* find the start and end record of the log block */ zilc = (zil_chain_t *)src; slrp = src + sizeof (zil_chain_t); aadp = aadbuf; blkend = src + ((byteswap) ? BSWAP_64(zilc->zc_nused) : zilc->zc_nused); /* calculate the number of encrypted iovecs we will need */ for (; slrp < blkend; slrp += lr_len) { lr = (lr_t *)slrp; if (!byteswap) { txtype = lr->lrc_txtype; lr_len = lr->lrc_reclen; } else { txtype = BSWAP_64(lr->lrc_txtype); lr_len = BSWAP_64(lr->lrc_reclen); } nr_iovecs++; if (txtype == TX_WRITE && lr_len != sizeof (lr_write_t)) nr_iovecs++; } nr_src += nr_iovecs; nr_dst += nr_iovecs; /* allocate the iovec arrays */ if (nr_src != 0) { src_iovecs = kmem_alloc(nr_src * sizeof (iovec_t), KM_SLEEP); if (src_iovecs == NULL) { ret = SET_ERROR(ENOMEM); goto error; } } if (nr_dst != 0) { dst_iovecs = kmem_alloc(nr_dst * sizeof (iovec_t), KM_SLEEP); if (dst_iovecs == NULL) { ret = SET_ERROR(ENOMEM); goto error; } } /* * Copy the plain zil header over and authenticate everything except * the checksum that will store our MAC. If we are writing the data * the embedded checksum will not have been calculated yet, so we don't * authenticate that. */ memcpy(dst, src, sizeof (zil_chain_t)); memcpy(aadp, src, sizeof (zil_chain_t) - sizeof (zio_eck_t)); aadp += sizeof (zil_chain_t) - sizeof (zio_eck_t); aad_len += sizeof (zil_chain_t) - sizeof (zio_eck_t); /* loop over records again, filling in iovecs */ nr_iovecs = 0; slrp = src + sizeof (zil_chain_t); dlrp = dst + sizeof (zil_chain_t); for (; slrp < blkend; slrp += lr_len, dlrp += lr_len) { lr = (lr_t *)slrp; if (!byteswap) { txtype = lr->lrc_txtype; lr_len = lr->lrc_reclen; } else { txtype = BSWAP_64(lr->lrc_txtype); lr_len = BSWAP_64(lr->lrc_reclen); } /* copy the common lr_t */ memcpy(dlrp, slrp, sizeof (lr_t)); memcpy(aadp, slrp, sizeof (lr_t)); aadp += sizeof (lr_t); aad_len += sizeof (lr_t); ASSERT3P(src_iovecs, !=, NULL); ASSERT3P(dst_iovecs, !=, NULL); /* * If this is a TX_WRITE record we want to encrypt everything * except the bp if exists. If the bp does exist we want to * authenticate it. */ if (txtype == TX_WRITE) { crypt_len = sizeof (lr_write_t) - sizeof (lr_t) - sizeof (blkptr_t); src_iovecs[nr_iovecs].iov_base = slrp + sizeof (lr_t); src_iovecs[nr_iovecs].iov_len = crypt_len; dst_iovecs[nr_iovecs].iov_base = dlrp + sizeof (lr_t); dst_iovecs[nr_iovecs].iov_len = crypt_len; /* copy the bp now since it will not be encrypted */ memcpy(dlrp + sizeof (lr_write_t) - sizeof (blkptr_t), slrp + sizeof (lr_write_t) - sizeof (blkptr_t), sizeof (blkptr_t)); memcpy(aadp, slrp + sizeof (lr_write_t) - sizeof (blkptr_t), sizeof (blkptr_t)); aadp += sizeof (blkptr_t); aad_len += sizeof (blkptr_t); nr_iovecs++; total_len += crypt_len; if (lr_len != sizeof (lr_write_t)) { crypt_len = lr_len - sizeof (lr_write_t); src_iovecs[nr_iovecs].iov_base = slrp + sizeof (lr_write_t); src_iovecs[nr_iovecs].iov_len = crypt_len; dst_iovecs[nr_iovecs].iov_base = dlrp + sizeof (lr_write_t); dst_iovecs[nr_iovecs].iov_len = crypt_len; nr_iovecs++; total_len += crypt_len; } } else { crypt_len = lr_len - sizeof (lr_t); src_iovecs[nr_iovecs].iov_base = slrp + sizeof (lr_t); src_iovecs[nr_iovecs].iov_len = crypt_len; dst_iovecs[nr_iovecs].iov_base = dlrp + sizeof (lr_t); dst_iovecs[nr_iovecs].iov_len = crypt_len; nr_iovecs++; total_len += crypt_len; } } *no_crypt = (nr_iovecs == 0); *enc_len = total_len; *authbuf = aadbuf; *auth_len = aad_len; if (encrypt) { puio->uio_iov = src_iovecs; puio->uio_iovcnt = nr_src; cuio->uio_iov = dst_iovecs; cuio->uio_iovcnt = nr_dst; } else { puio->uio_iov = dst_iovecs; puio->uio_iovcnt = nr_dst; cuio->uio_iov = src_iovecs; cuio->uio_iovcnt = nr_src; } return (0); error: zio_buf_free(aadbuf, datalen); if (src_iovecs != NULL) kmem_free(src_iovecs, nr_src * sizeof (iovec_t)); if (dst_iovecs != NULL) kmem_free(dst_iovecs, nr_dst * sizeof (iovec_t)); *enc_len = 0; *authbuf = NULL; *auth_len = 0; *no_crypt = B_FALSE; puio->uio_iov = NULL; puio->uio_iovcnt = 0; cuio->uio_iov = NULL; cuio->uio_iovcnt = 0; return (ret); } /* * Special case handling routine for encrypting / decrypting dnode blocks. */ static int zio_crypt_init_uios_dnode(boolean_t encrypt, uint64_t version, uint8_t *plainbuf, uint8_t *cipherbuf, uint_t datalen, boolean_t byteswap, zfs_uio_t *puio, zfs_uio_t *cuio, uint_t *enc_len, uint8_t **authbuf, uint_t *auth_len, boolean_t *no_crypt) { int ret; uint_t nr_src, nr_dst, crypt_len; uint_t aad_len = 0, nr_iovecs = 0, total_len = 0; uint_t i, j, max_dnp = datalen >> DNODE_SHIFT; iovec_t *src_iovecs = NULL, *dst_iovecs = NULL; uint8_t *src, *dst, *aadp; dnode_phys_t *dnp, *adnp, *sdnp, *ddnp; uint8_t *aadbuf = zio_buf_alloc(datalen); if (encrypt) { src = plainbuf; dst = cipherbuf; nr_src = 0; nr_dst = 1; } else { src = cipherbuf; dst = plainbuf; nr_src = 1; nr_dst = 0; } sdnp = (dnode_phys_t *)src; ddnp = (dnode_phys_t *)dst; aadp = aadbuf; /* * Count the number of iovecs we will need to do the encryption by * counting the number of bonus buffers that need to be encrypted. */ for (i = 0; i < max_dnp; i += sdnp[i].dn_extra_slots + 1) { /* * This block may still be byteswapped. However, all of the * values we use are either uint8_t's (for which byteswapping * is a noop) or a * != 0 check, which will work regardless * of whether or not we byteswap. */ if (sdnp[i].dn_type != DMU_OT_NONE && DMU_OT_IS_ENCRYPTED(sdnp[i].dn_bonustype) && sdnp[i].dn_bonuslen != 0) { nr_iovecs++; } } nr_src += nr_iovecs; nr_dst += nr_iovecs; if (nr_src != 0) { src_iovecs = kmem_alloc(nr_src * sizeof (iovec_t), KM_SLEEP); if (src_iovecs == NULL) { ret = SET_ERROR(ENOMEM); goto error; } } if (nr_dst != 0) { dst_iovecs = kmem_alloc(nr_dst * sizeof (iovec_t), KM_SLEEP); if (dst_iovecs == NULL) { ret = SET_ERROR(ENOMEM); goto error; } } nr_iovecs = 0; /* * Iterate through the dnodes again, this time filling in the uios * we allocated earlier. We also concatenate any data we want to * authenticate onto aadbuf. */ for (i = 0; i < max_dnp; i += sdnp[i].dn_extra_slots + 1) { dnp = &sdnp[i]; /* copy over the core fields and blkptrs (kept as plaintext) */ memcpy(&ddnp[i], dnp, (uint8_t *)DN_BONUS(dnp) - (uint8_t *)dnp); if (dnp->dn_flags & DNODE_FLAG_SPILL_BLKPTR) { memcpy(DN_SPILL_BLKPTR(&ddnp[i]), DN_SPILL_BLKPTR(dnp), sizeof (blkptr_t)); } /* * Handle authenticated data. We authenticate everything in * the dnode that can be brought over when we do a raw send. * This includes all of the core fields as well as the MACs * stored in the bp checksums and all of the portable bits * from blk_prop. We include the dnode padding here in case it * ever gets used in the future. Some dn_flags and dn_used are * not portable so we mask those out values out of the * authenticated data. */ crypt_len = offsetof(dnode_phys_t, dn_blkptr); memcpy(aadp, dnp, crypt_len); adnp = (dnode_phys_t *)aadp; adnp->dn_flags &= DNODE_CRYPT_PORTABLE_FLAGS_MASK; adnp->dn_used = 0; aadp += crypt_len; aad_len += crypt_len; for (j = 0; j < dnp->dn_nblkptr; j++) { zio_crypt_bp_do_aad_updates(&aadp, &aad_len, version, byteswap, &dnp->dn_blkptr[j]); } if (dnp->dn_flags & DNODE_FLAG_SPILL_BLKPTR) { zio_crypt_bp_do_aad_updates(&aadp, &aad_len, version, byteswap, DN_SPILL_BLKPTR(dnp)); } /* * If this bonus buffer needs to be encrypted, we prepare an * iovec_t. The encryption / decryption functions will fill * this in for us with the encrypted or decrypted data. * Otherwise we add the bonus buffer to the authenticated * data buffer and copy it over to the destination. The * encrypted iovec extends to DN_MAX_BONUS_LEN(dnp) so that * we can guarantee alignment with the AES block size * (128 bits). */ crypt_len = DN_MAX_BONUS_LEN(dnp); if (dnp->dn_type != DMU_OT_NONE && DMU_OT_IS_ENCRYPTED(dnp->dn_bonustype) && dnp->dn_bonuslen != 0) { ASSERT3U(nr_iovecs, <, nr_src); ASSERT3U(nr_iovecs, <, nr_dst); ASSERT3P(src_iovecs, !=, NULL); ASSERT3P(dst_iovecs, !=, NULL); src_iovecs[nr_iovecs].iov_base = DN_BONUS(dnp); src_iovecs[nr_iovecs].iov_len = crypt_len; dst_iovecs[nr_iovecs].iov_base = DN_BONUS(&ddnp[i]); dst_iovecs[nr_iovecs].iov_len = crypt_len; nr_iovecs++; total_len += crypt_len; } else { memcpy(DN_BONUS(&ddnp[i]), DN_BONUS(dnp), crypt_len); memcpy(aadp, DN_BONUS(dnp), crypt_len); aadp += crypt_len; aad_len += crypt_len; } } *no_crypt = (nr_iovecs == 0); *enc_len = total_len; *authbuf = aadbuf; *auth_len = aad_len; if (encrypt) { puio->uio_iov = src_iovecs; puio->uio_iovcnt = nr_src; cuio->uio_iov = dst_iovecs; cuio->uio_iovcnt = nr_dst; } else { puio->uio_iov = dst_iovecs; puio->uio_iovcnt = nr_dst; cuio->uio_iov = src_iovecs; cuio->uio_iovcnt = nr_src; } return (0); error: zio_buf_free(aadbuf, datalen); if (src_iovecs != NULL) kmem_free(src_iovecs, nr_src * sizeof (iovec_t)); if (dst_iovecs != NULL) kmem_free(dst_iovecs, nr_dst * sizeof (iovec_t)); *enc_len = 0; *authbuf = NULL; *auth_len = 0; *no_crypt = B_FALSE; puio->uio_iov = NULL; puio->uio_iovcnt = 0; cuio->uio_iov = NULL; cuio->uio_iovcnt = 0; return (ret); } static int zio_crypt_init_uios_normal(boolean_t encrypt, uint8_t *plainbuf, uint8_t *cipherbuf, uint_t datalen, zfs_uio_t *puio, zfs_uio_t *cuio, uint_t *enc_len) { (void) encrypt; int ret; uint_t nr_plain = 1, nr_cipher = 2; iovec_t *plain_iovecs = NULL, *cipher_iovecs = NULL; /* allocate the iovecs for the plain and cipher data */ plain_iovecs = kmem_alloc(nr_plain * sizeof (iovec_t), KM_SLEEP); if (!plain_iovecs) { ret = SET_ERROR(ENOMEM); goto error; } cipher_iovecs = kmem_alloc(nr_cipher * sizeof (iovec_t), KM_SLEEP); if (!cipher_iovecs) { ret = SET_ERROR(ENOMEM); goto error; } plain_iovecs[0].iov_base = plainbuf; plain_iovecs[0].iov_len = datalen; cipher_iovecs[0].iov_base = cipherbuf; cipher_iovecs[0].iov_len = datalen; *enc_len = datalen; puio->uio_iov = plain_iovecs; puio->uio_iovcnt = nr_plain; cuio->uio_iov = cipher_iovecs; cuio->uio_iovcnt = nr_cipher; return (0); error: if (plain_iovecs != NULL) kmem_free(plain_iovecs, nr_plain * sizeof (iovec_t)); if (cipher_iovecs != NULL) kmem_free(cipher_iovecs, nr_cipher * sizeof (iovec_t)); *enc_len = 0; puio->uio_iov = NULL; puio->uio_iovcnt = 0; cuio->uio_iov = NULL; cuio->uio_iovcnt = 0; return (ret); } /* * This function builds up the plaintext (puio) and ciphertext (cuio) uios so * that they can be used for encryption and decryption by zio_do_crypt_uio(). * Most blocks will use zio_crypt_init_uios_normal(), with ZIL and dnode blocks * requiring special handling to parse out pieces that are to be encrypted. The * authbuf is used by these special cases to store additional authenticated * data (AAD) for the encryption modes. */ static int zio_crypt_init_uios(boolean_t encrypt, uint64_t version, dmu_object_type_t ot, uint8_t *plainbuf, uint8_t *cipherbuf, uint_t datalen, boolean_t byteswap, uint8_t *mac, zfs_uio_t *puio, zfs_uio_t *cuio, uint_t *enc_len, uint8_t **authbuf, uint_t *auth_len, boolean_t *no_crypt) { int ret; iovec_t *mac_iov; ASSERT(DMU_OT_IS_ENCRYPTED(ot) || ot == DMU_OT_NONE); /* route to handler */ switch (ot) { case DMU_OT_INTENT_LOG: ret = zio_crypt_init_uios_zil(encrypt, plainbuf, cipherbuf, datalen, byteswap, puio, cuio, enc_len, authbuf, auth_len, no_crypt); break; case DMU_OT_DNODE: ret = zio_crypt_init_uios_dnode(encrypt, version, plainbuf, cipherbuf, datalen, byteswap, puio, cuio, enc_len, authbuf, auth_len, no_crypt); break; default: ret = zio_crypt_init_uios_normal(encrypt, plainbuf, cipherbuf, datalen, puio, cuio, enc_len); *authbuf = NULL; *auth_len = 0; *no_crypt = B_FALSE; break; } if (ret != 0) goto error; /* populate the uios */ puio->uio_segflg = UIO_SYSSPACE; cuio->uio_segflg = UIO_SYSSPACE; mac_iov = ((iovec_t *)&cuio->uio_iov[cuio->uio_iovcnt - 1]); mac_iov->iov_base = mac; mac_iov->iov_len = ZIO_DATA_MAC_LEN; return (0); error: return (ret); } /* * Primary encryption / decryption entrypoint for zio data. */ int zio_do_crypt_data(boolean_t encrypt, zio_crypt_key_t *key, dmu_object_type_t ot, boolean_t byteswap, uint8_t *salt, uint8_t *iv, uint8_t *mac, uint_t datalen, uint8_t *plainbuf, uint8_t *cipherbuf, boolean_t *no_crypt) { int ret; boolean_t locked = B_FALSE; uint64_t crypt = key->zk_crypt; uint_t keydata_len = zio_crypt_table[crypt].ci_keylen; uint_t enc_len, auth_len; zfs_uio_t puio, cuio; uint8_t enc_keydata[MASTER_KEY_MAX_LEN]; crypto_key_t tmp_ckey, *ckey = NULL; crypto_ctx_template_t tmpl; uint8_t *authbuf = NULL; memset(&puio, 0, sizeof (puio)); memset(&cuio, 0, sizeof (cuio)); /* * If the needed key is the current one, just use it. Otherwise we * need to generate a temporary one from the given salt + master key. * If we are encrypting, we must return a copy of the current salt * so that it can be stored in the blkptr_t. */ rw_enter(&key->zk_salt_lock, RW_READER); locked = B_TRUE; if (memcmp(salt, key->zk_salt, ZIO_DATA_SALT_LEN) == 0) { ckey = &key->zk_current_key; tmpl = key->zk_current_tmpl; } else { rw_exit(&key->zk_salt_lock); locked = B_FALSE; ret = hkdf_sha512(key->zk_master_keydata, keydata_len, NULL, 0, salt, ZIO_DATA_SALT_LEN, enc_keydata, keydata_len); if (ret != 0) goto error; tmp_ckey.ck_data = enc_keydata; tmp_ckey.ck_length = CRYPTO_BYTES2BITS(keydata_len); ckey = &tmp_ckey; tmpl = NULL; } /* * Attempt to use QAT acceleration if we can. We currently don't * do this for metadnode and ZIL blocks, since they have a much * more involved buffer layout and the qat_crypt() function only * works in-place. */ if (qat_crypt_use_accel(datalen) && ot != DMU_OT_INTENT_LOG && ot != DMU_OT_DNODE) { uint8_t *srcbuf, *dstbuf; if (encrypt) { srcbuf = plainbuf; dstbuf = cipherbuf; } else { srcbuf = cipherbuf; dstbuf = plainbuf; } ret = qat_crypt((encrypt) ? QAT_ENCRYPT : QAT_DECRYPT, srcbuf, dstbuf, NULL, 0, iv, mac, ckey, key->zk_crypt, datalen); if (ret == CPA_STATUS_SUCCESS) { if (locked) { rw_exit(&key->zk_salt_lock); locked = B_FALSE; } return (0); } /* If the hardware implementation fails fall back to software */ } /* create uios for encryption */ ret = zio_crypt_init_uios(encrypt, key->zk_version, ot, plainbuf, cipherbuf, datalen, byteswap, mac, &puio, &cuio, &enc_len, &authbuf, &auth_len, no_crypt); if (ret != 0) goto error; /* perform the encryption / decryption in software */ ret = zio_do_crypt_uio(encrypt, key->zk_crypt, ckey, tmpl, iv, enc_len, &puio, &cuio, authbuf, auth_len); if (ret != 0) goto error; if (locked) { rw_exit(&key->zk_salt_lock); - locked = B_FALSE; } if (authbuf != NULL) zio_buf_free(authbuf, datalen); if (ckey == &tmp_ckey) memset(enc_keydata, 0, keydata_len); zio_crypt_destroy_uio(&puio); zio_crypt_destroy_uio(&cuio); return (0); error: if (locked) rw_exit(&key->zk_salt_lock); if (authbuf != NULL) zio_buf_free(authbuf, datalen); if (ckey == &tmp_ckey) memset(enc_keydata, 0, keydata_len); zio_crypt_destroy_uio(&puio); zio_crypt_destroy_uio(&cuio); return (ret); } /* * Simple wrapper around zio_do_crypt_data() to work with abd's instead of * linear buffers. */ int zio_do_crypt_abd(boolean_t encrypt, zio_crypt_key_t *key, dmu_object_type_t ot, boolean_t byteswap, uint8_t *salt, uint8_t *iv, uint8_t *mac, uint_t datalen, abd_t *pabd, abd_t *cabd, boolean_t *no_crypt) { int ret; void *ptmp, *ctmp; if (encrypt) { ptmp = abd_borrow_buf_copy(pabd, datalen); ctmp = abd_borrow_buf(cabd, datalen); } else { ptmp = abd_borrow_buf(pabd, datalen); ctmp = abd_borrow_buf_copy(cabd, datalen); } ret = zio_do_crypt_data(encrypt, key, ot, byteswap, salt, iv, mac, datalen, ptmp, ctmp, no_crypt); if (ret != 0) goto error; if (encrypt) { abd_return_buf(pabd, ptmp, datalen); abd_return_buf_copy(cabd, ctmp, datalen); } else { abd_return_buf_copy(pabd, ptmp, datalen); abd_return_buf(cabd, ctmp, datalen); } return (0); error: if (encrypt) { abd_return_buf(pabd, ptmp, datalen); abd_return_buf_copy(cabd, ctmp, datalen); } else { abd_return_buf_copy(pabd, ptmp, datalen); abd_return_buf(cabd, ctmp, datalen); } return (ret); } #if defined(_KERNEL) /* CSTYLED */ module_param(zfs_key_max_salt_uses, ulong, 0644); MODULE_PARM_DESC(zfs_key_max_salt_uses, "Max number of times a salt value " "can be used for generating encryption keys before it is rotated"); #endif diff --git a/module/zfs/arc.c b/module/zfs/arc.c index 58fb36207313..54cfb4bd3d04 100644 --- a/module/zfs/arc.c +++ b/module/zfs/arc.c @@ -1,11200 +1,11200 @@ /* * CDDL HEADER START * * The contents of this file are subject to the terms of the * Common Development and Distribution License (the "License"). * You may not use this file except in compliance with the License. * * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE * or https://opensource.org/licenses/CDDL-1.0. * See the License for the specific language governing permissions * and limitations under the License. * * When distributing Covered Code, include this CDDL HEADER in each * file and include the License file at usr/src/OPENSOLARIS.LICENSE. * If applicable, add the following below this CDDL HEADER, with the * fields enclosed by brackets "[]" replaced with your own identifying * information: Portions Copyright [yyyy] [name of copyright owner] * * CDDL HEADER END */ /* * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. * Copyright (c) 2018, Joyent, Inc. * Copyright (c) 2011, 2020, Delphix. All rights reserved. * Copyright (c) 2014, Saso Kiselkov. All rights reserved. * Copyright (c) 2017, Nexenta Systems, Inc. All rights reserved. * Copyright (c) 2019, loli10K . All rights reserved. * Copyright (c) 2020, George Amanakis. All rights reserved. * Copyright (c) 2019, Klara Inc. * Copyright (c) 2019, Allan Jude * Copyright (c) 2020, The FreeBSD Foundation [1] * * [1] Portions of this software were developed by Allan Jude * under sponsorship from the FreeBSD Foundation. */ /* * DVA-based Adjustable Replacement Cache * * While much of the theory of operation used here is * based on the self-tuning, low overhead replacement cache * presented by Megiddo and Modha at FAST 2003, there are some * significant differences: * * 1. The Megiddo and Modha model assumes any page is evictable. * Pages in its cache cannot be "locked" into memory. This makes * the eviction algorithm simple: evict the last page in the list. * This also make the performance characteristics easy to reason * about. Our cache is not so simple. At any given moment, some * subset of the blocks in the cache are un-evictable because we * have handed out a reference to them. Blocks are only evictable * when there are no external references active. This makes * eviction far more problematic: we choose to evict the evictable * blocks that are the "lowest" in the list. * * There are times when it is not possible to evict the requested * space. In these circumstances we are unable to adjust the cache * size. To prevent the cache growing unbounded at these times we * implement a "cache throttle" that slows the flow of new data * into the cache until we can make space available. * * 2. The Megiddo and Modha model assumes a fixed cache size. * Pages are evicted when the cache is full and there is a cache * miss. Our model has a variable sized cache. It grows with * high use, but also tries to react to memory pressure from the * operating system: decreasing its size when system memory is * tight. * * 3. The Megiddo and Modha model assumes a fixed page size. All * elements of the cache are therefore exactly the same size. So * when adjusting the cache size following a cache miss, its simply * a matter of choosing a single page to evict. In our model, we * have variable sized cache blocks (ranging from 512 bytes to * 128K bytes). We therefore choose a set of blocks to evict to make * space for a cache miss that approximates as closely as possible * the space used by the new block. * * See also: "ARC: A Self-Tuning, Low Overhead Replacement Cache" * by N. Megiddo & D. Modha, FAST 2003 */ /* * The locking model: * * A new reference to a cache buffer can be obtained in two * ways: 1) via a hash table lookup using the DVA as a key, * or 2) via one of the ARC lists. The arc_read() interface * uses method 1, while the internal ARC algorithms for * adjusting the cache use method 2. We therefore provide two * types of locks: 1) the hash table lock array, and 2) the * ARC list locks. * * Buffers do not have their own mutexes, rather they rely on the * hash table mutexes for the bulk of their protection (i.e. most * fields in the arc_buf_hdr_t are protected by these mutexes). * * buf_hash_find() returns the appropriate mutex (held) when it * locates the requested buffer in the hash table. It returns * NULL for the mutex if the buffer was not in the table. * * buf_hash_remove() expects the appropriate hash mutex to be * already held before it is invoked. * * Each ARC state also has a mutex which is used to protect the * buffer list associated with the state. When attempting to * obtain a hash table lock while holding an ARC list lock you * must use: mutex_tryenter() to avoid deadlock. Also note that * the active state mutex must be held before the ghost state mutex. * * It as also possible to register a callback which is run when the * arc_meta_limit is reached and no buffers can be safely evicted. In * this case the arc user should drop a reference on some arc buffers so * they can be reclaimed and the arc_meta_limit honored. For example, * when using the ZPL each dentry holds a references on a znode. These * dentries must be pruned before the arc buffer holding the znode can * be safely evicted. * * Note that the majority of the performance stats are manipulated * with atomic operations. * * The L2ARC uses the l2ad_mtx on each vdev for the following: * * - L2ARC buflist creation * - L2ARC buflist eviction * - L2ARC write completion, which walks L2ARC buflists * - ARC header destruction, as it removes from L2ARC buflists * - ARC header release, as it removes from L2ARC buflists */ /* * ARC operation: * * Every block that is in the ARC is tracked by an arc_buf_hdr_t structure. * This structure can point either to a block that is still in the cache or to * one that is only accessible in an L2 ARC device, or it can provide * information about a block that was recently evicted. If a block is * only accessible in the L2ARC, then the arc_buf_hdr_t only has enough * information to retrieve it from the L2ARC device. This information is * stored in the l2arc_buf_hdr_t sub-structure of the arc_buf_hdr_t. A block * that is in this state cannot access the data directly. * * Blocks that are actively being referenced or have not been evicted * are cached in the L1ARC. The L1ARC (l1arc_buf_hdr_t) is a structure within * the arc_buf_hdr_t that will point to the data block in memory. A block can * only be read by a consumer if it has an l1arc_buf_hdr_t. The L1ARC * caches data in two ways -- in a list of ARC buffers (arc_buf_t) and * also in the arc_buf_hdr_t's private physical data block pointer (b_pabd). * * The L1ARC's data pointer may or may not be uncompressed. The ARC has the * ability to store the physical data (b_pabd) associated with the DVA of the * arc_buf_hdr_t. Since the b_pabd is a copy of the on-disk physical block, * it will match its on-disk compression characteristics. This behavior can be * disabled by setting 'zfs_compressed_arc_enabled' to B_FALSE. When the * compressed ARC functionality is disabled, the b_pabd will point to an * uncompressed version of the on-disk data. * * Data in the L1ARC is not accessed by consumers of the ARC directly. Each * arc_buf_hdr_t can have multiple ARC buffers (arc_buf_t) which reference it. * Each ARC buffer (arc_buf_t) is being actively accessed by a specific ARC * consumer. The ARC will provide references to this data and will keep it * cached until it is no longer in use. The ARC caches only the L1ARC's physical * data block and will evict any arc_buf_t that is no longer referenced. The * amount of memory consumed by the arc_buf_ts' data buffers can be seen via the * "overhead_size" kstat. * * Depending on the consumer, an arc_buf_t can be requested in uncompressed or * compressed form. The typical case is that consumers will want uncompressed * data, and when that happens a new data buffer is allocated where the data is * decompressed for them to use. Currently the only consumer who wants * compressed arc_buf_t's is "zfs send", when it streams data exactly as it * exists on disk. When this happens, the arc_buf_t's data buffer is shared * with the arc_buf_hdr_t. * * Here is a diagram showing an arc_buf_hdr_t referenced by two arc_buf_t's. The * first one is owned by a compressed send consumer (and therefore references * the same compressed data buffer as the arc_buf_hdr_t) and the second could be * used by any other consumer (and has its own uncompressed copy of the data * buffer). * * arc_buf_hdr_t * +-----------+ * | fields | * | common to | * | L1- and | * | L2ARC | * +-----------+ * | l2arc_buf_hdr_t * | | * +-----------+ * | l1arc_buf_hdr_t * | | arc_buf_t * | b_buf +------------>+-----------+ arc_buf_t * | b_pabd +-+ |b_next +---->+-----------+ * +-----------+ | |-----------| |b_next +-->NULL * | |b_comp = T | +-----------+ * | |b_data +-+ |b_comp = F | * | +-----------+ | |b_data +-+ * +->+------+ | +-----------+ | * compressed | | | | * data | |<--------------+ | uncompressed * +------+ compressed, | data * shared +-->+------+ * data | | * | | * +------+ * * When a consumer reads a block, the ARC must first look to see if the * arc_buf_hdr_t is cached. If the hdr is cached then the ARC allocates a new * arc_buf_t and either copies uncompressed data into a new data buffer from an * existing uncompressed arc_buf_t, decompresses the hdr's b_pabd buffer into a * new data buffer, or shares the hdr's b_pabd buffer, depending on whether the * hdr is compressed and the desired compression characteristics of the * arc_buf_t consumer. If the arc_buf_t ends up sharing data with the * arc_buf_hdr_t and both of them are uncompressed then the arc_buf_t must be * the last buffer in the hdr's b_buf list, however a shared compressed buf can * be anywhere in the hdr's list. * * The diagram below shows an example of an uncompressed ARC hdr that is * sharing its data with an arc_buf_t (note that the shared uncompressed buf is * the last element in the buf list): * * arc_buf_hdr_t * +-----------+ * | | * | | * | | * +-----------+ * l2arc_buf_hdr_t| | * | | * +-----------+ * l1arc_buf_hdr_t| | * | | arc_buf_t (shared) * | b_buf +------------>+---------+ arc_buf_t * | | |b_next +---->+---------+ * | b_pabd +-+ |---------| |b_next +-->NULL * +-----------+ | | | +---------+ * | |b_data +-+ | | * | +---------+ | |b_data +-+ * +->+------+ | +---------+ | * | | | | * uncompressed | | | | * data +------+ | | * ^ +->+------+ | * | uncompressed | | | * | data | | | * | +------+ | * +---------------------------------+ * * Writing to the ARC requires that the ARC first discard the hdr's b_pabd * since the physical block is about to be rewritten. The new data contents * will be contained in the arc_buf_t. As the I/O pipeline performs the write, * it may compress the data before writing it to disk. The ARC will be called * with the transformed data and will memcpy the transformed on-disk block into * a newly allocated b_pabd. Writes are always done into buffers which have * either been loaned (and hence are new and don't have other readers) or * buffers which have been released (and hence have their own hdr, if there * were originally other readers of the buf's original hdr). This ensures that * the ARC only needs to update a single buf and its hdr after a write occurs. * * When the L2ARC is in use, it will also take advantage of the b_pabd. The * L2ARC will always write the contents of b_pabd to the L2ARC. This means * that when compressed ARC is enabled that the L2ARC blocks are identical * to the on-disk block in the main data pool. This provides a significant * advantage since the ARC can leverage the bp's checksum when reading from the * L2ARC to determine if the contents are valid. However, if the compressed * ARC is disabled, then the L2ARC's block must be transformed to look * like the physical block in the main data pool before comparing the * checksum and determining its validity. * * The L1ARC has a slightly different system for storing encrypted data. * Raw (encrypted + possibly compressed) data has a few subtle differences from * data that is just compressed. The biggest difference is that it is not * possible to decrypt encrypted data (or vice-versa) if the keys aren't loaded. * The other difference is that encryption cannot be treated as a suggestion. * If a caller would prefer compressed data, but they actually wind up with * uncompressed data the worst thing that could happen is there might be a * performance hit. If the caller requests encrypted data, however, we must be * sure they actually get it or else secret information could be leaked. Raw * data is stored in hdr->b_crypt_hdr.b_rabd. An encrypted header, therefore, * may have both an encrypted version and a decrypted version of its data at * once. When a caller needs a raw arc_buf_t, it is allocated and the data is * copied out of this header. To avoid complications with b_pabd, raw buffers * cannot be shared. */ #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #ifndef _KERNEL /* set with ZFS_DEBUG=watch, to enable watchpoints on frozen buffers */ boolean_t arc_watch = B_FALSE; #endif /* * This thread's job is to keep enough free memory in the system, by * calling arc_kmem_reap_soon() plus arc_reduce_target_size(), which improves * arc_available_memory(). */ static zthr_t *arc_reap_zthr; /* * This thread's job is to keep arc_size under arc_c, by calling * arc_evict(), which improves arc_is_overflowing(). */ static zthr_t *arc_evict_zthr; static arc_buf_hdr_t **arc_state_evict_markers; static int arc_state_evict_marker_count; static kmutex_t arc_evict_lock; static boolean_t arc_evict_needed = B_FALSE; /* * Count of bytes evicted since boot. */ static uint64_t arc_evict_count; /* * List of arc_evict_waiter_t's, representing threads waiting for the * arc_evict_count to reach specific values. */ static list_t arc_evict_waiters; /* * When arc_is_overflowing(), arc_get_data_impl() waits for this percent of * the requested amount of data to be evicted. For example, by default for * every 2KB that's evicted, 1KB of it may be "reused" by a new allocation. * Since this is above 100%, it ensures that progress is made towards getting * arc_size under arc_c. Since this is finite, it ensures that allocations * can still happen, even during the potentially long time that arc_size is * more than arc_c. */ static uint_t zfs_arc_eviction_pct = 200; /* * The number of headers to evict in arc_evict_state_impl() before * dropping the sublist lock and evicting from another sublist. A lower * value means we're more likely to evict the "correct" header (i.e. the * oldest header in the arc state), but comes with higher overhead * (i.e. more invocations of arc_evict_state_impl()). */ static uint_t zfs_arc_evict_batch_limit = 10; /* number of seconds before growing cache again */ uint_t arc_grow_retry = 5; /* * Minimum time between calls to arc_kmem_reap_soon(). */ static const int arc_kmem_cache_reap_retry_ms = 1000; /* shift of arc_c for calculating overflow limit in arc_get_data_impl */ static int zfs_arc_overflow_shift = 8; /* shift of arc_c for calculating both min and max arc_p */ static uint_t arc_p_min_shift = 4; /* log2(fraction of arc to reclaim) */ uint_t arc_shrink_shift = 7; /* percent of pagecache to reclaim arc to */ #ifdef _KERNEL uint_t zfs_arc_pc_percent = 0; #endif /* * log2(fraction of ARC which must be free to allow growing). * I.e. If there is less than arc_c >> arc_no_grow_shift free memory, * when reading a new block into the ARC, we will evict an equal-sized block * from the ARC. * * This must be less than arc_shrink_shift, so that when we shrink the ARC, * we will still not allow it to grow. */ uint_t arc_no_grow_shift = 5; /* * minimum lifespan of a prefetch block in clock ticks * (initialized in arc_init()) */ static uint_t arc_min_prefetch_ms; static uint_t arc_min_prescient_prefetch_ms; /* * If this percent of memory is free, don't throttle. */ uint_t arc_lotsfree_percent = 10; /* * The arc has filled available memory and has now warmed up. */ boolean_t arc_warm; /* * These tunables are for performance analysis. */ uint64_t zfs_arc_max = 0; uint64_t zfs_arc_min = 0; uint64_t zfs_arc_meta_limit = 0; uint64_t zfs_arc_meta_min = 0; static uint64_t zfs_arc_dnode_limit = 0; static uint_t zfs_arc_dnode_reduce_percent = 10; static uint_t zfs_arc_grow_retry = 0; static uint_t zfs_arc_shrink_shift = 0; static uint_t zfs_arc_p_min_shift = 0; uint_t zfs_arc_average_blocksize = 8 * 1024; /* 8KB */ /* * ARC dirty data constraints for arc_tempreserve_space() throttle: * * total dirty data limit * * anon block dirty limit * * each pool's anon allowance */ static const unsigned long zfs_arc_dirty_limit_percent = 50; static const unsigned long zfs_arc_anon_limit_percent = 25; static const unsigned long zfs_arc_pool_dirty_percent = 20; /* * Enable or disable compressed arc buffers. */ int zfs_compressed_arc_enabled = B_TRUE; /* * ARC will evict meta buffers that exceed arc_meta_limit. This * tunable make arc_meta_limit adjustable for different workloads. */ static uint64_t zfs_arc_meta_limit_percent = 75; /* * Percentage that can be consumed by dnodes of ARC meta buffers. */ static uint_t zfs_arc_dnode_limit_percent = 10; /* * These tunables are Linux-specific */ static uint64_t zfs_arc_sys_free = 0; static uint_t zfs_arc_min_prefetch_ms = 0; static uint_t zfs_arc_min_prescient_prefetch_ms = 0; static int zfs_arc_p_dampener_disable = 1; static uint_t zfs_arc_meta_prune = 10000; static uint_t zfs_arc_meta_strategy = ARC_STRATEGY_META_BALANCED; static uint_t zfs_arc_meta_adjust_restarts = 4096; static uint_t zfs_arc_lotsfree_percent = 10; /* * Number of arc_prune threads */ static int zfs_arc_prune_task_threads = 1; /* The 6 states: */ arc_state_t ARC_anon; arc_state_t ARC_mru; arc_state_t ARC_mru_ghost; arc_state_t ARC_mfu; arc_state_t ARC_mfu_ghost; arc_state_t ARC_l2c_only; arc_stats_t arc_stats = { { "hits", KSTAT_DATA_UINT64 }, { "misses", KSTAT_DATA_UINT64 }, { "demand_data_hits", KSTAT_DATA_UINT64 }, { "demand_data_misses", KSTAT_DATA_UINT64 }, { "demand_metadata_hits", KSTAT_DATA_UINT64 }, { "demand_metadata_misses", KSTAT_DATA_UINT64 }, { "prefetch_data_hits", KSTAT_DATA_UINT64 }, { "prefetch_data_misses", KSTAT_DATA_UINT64 }, { "prefetch_metadata_hits", KSTAT_DATA_UINT64 }, { "prefetch_metadata_misses", KSTAT_DATA_UINT64 }, { "mru_hits", KSTAT_DATA_UINT64 }, { "mru_ghost_hits", KSTAT_DATA_UINT64 }, { "mfu_hits", KSTAT_DATA_UINT64 }, { "mfu_ghost_hits", KSTAT_DATA_UINT64 }, { "deleted", KSTAT_DATA_UINT64 }, { "mutex_miss", KSTAT_DATA_UINT64 }, { "access_skip", KSTAT_DATA_UINT64 }, { "evict_skip", KSTAT_DATA_UINT64 }, { "evict_not_enough", KSTAT_DATA_UINT64 }, { "evict_l2_cached", KSTAT_DATA_UINT64 }, { "evict_l2_eligible", KSTAT_DATA_UINT64 }, { "evict_l2_eligible_mfu", KSTAT_DATA_UINT64 }, { "evict_l2_eligible_mru", KSTAT_DATA_UINT64 }, { "evict_l2_ineligible", KSTAT_DATA_UINT64 }, { "evict_l2_skip", KSTAT_DATA_UINT64 }, { "hash_elements", KSTAT_DATA_UINT64 }, { "hash_elements_max", KSTAT_DATA_UINT64 }, { "hash_collisions", KSTAT_DATA_UINT64 }, { "hash_chains", KSTAT_DATA_UINT64 }, { "hash_chain_max", KSTAT_DATA_UINT64 }, { "p", KSTAT_DATA_UINT64 }, { "c", KSTAT_DATA_UINT64 }, { "c_min", KSTAT_DATA_UINT64 }, { "c_max", KSTAT_DATA_UINT64 }, { "size", KSTAT_DATA_UINT64 }, { "compressed_size", KSTAT_DATA_UINT64 }, { "uncompressed_size", KSTAT_DATA_UINT64 }, { "overhead_size", KSTAT_DATA_UINT64 }, { "hdr_size", KSTAT_DATA_UINT64 }, { "data_size", KSTAT_DATA_UINT64 }, { "metadata_size", KSTAT_DATA_UINT64 }, { "dbuf_size", KSTAT_DATA_UINT64 }, { "dnode_size", KSTAT_DATA_UINT64 }, { "bonus_size", KSTAT_DATA_UINT64 }, #if defined(COMPAT_FREEBSD11) { "other_size", KSTAT_DATA_UINT64 }, #endif { "anon_size", KSTAT_DATA_UINT64 }, { "anon_evictable_data", KSTAT_DATA_UINT64 }, { "anon_evictable_metadata", KSTAT_DATA_UINT64 }, { "mru_size", KSTAT_DATA_UINT64 }, { "mru_evictable_data", KSTAT_DATA_UINT64 }, { "mru_evictable_metadata", KSTAT_DATA_UINT64 }, { "mru_ghost_size", KSTAT_DATA_UINT64 }, { "mru_ghost_evictable_data", KSTAT_DATA_UINT64 }, { "mru_ghost_evictable_metadata", KSTAT_DATA_UINT64 }, { "mfu_size", KSTAT_DATA_UINT64 }, { "mfu_evictable_data", KSTAT_DATA_UINT64 }, { "mfu_evictable_metadata", KSTAT_DATA_UINT64 }, { "mfu_ghost_size", KSTAT_DATA_UINT64 }, { "mfu_ghost_evictable_data", KSTAT_DATA_UINT64 }, { "mfu_ghost_evictable_metadata", KSTAT_DATA_UINT64 }, { "l2_hits", KSTAT_DATA_UINT64 }, { "l2_misses", KSTAT_DATA_UINT64 }, { "l2_prefetch_asize", KSTAT_DATA_UINT64 }, { "l2_mru_asize", KSTAT_DATA_UINT64 }, { "l2_mfu_asize", KSTAT_DATA_UINT64 }, { "l2_bufc_data_asize", KSTAT_DATA_UINT64 }, { "l2_bufc_metadata_asize", KSTAT_DATA_UINT64 }, { "l2_feeds", KSTAT_DATA_UINT64 }, { "l2_rw_clash", KSTAT_DATA_UINT64 }, { "l2_read_bytes", KSTAT_DATA_UINT64 }, { "l2_write_bytes", KSTAT_DATA_UINT64 }, { "l2_writes_sent", KSTAT_DATA_UINT64 }, { "l2_writes_done", KSTAT_DATA_UINT64 }, { "l2_writes_error", KSTAT_DATA_UINT64 }, { "l2_writes_lock_retry", KSTAT_DATA_UINT64 }, { "l2_evict_lock_retry", KSTAT_DATA_UINT64 }, { "l2_evict_reading", KSTAT_DATA_UINT64 }, { "l2_evict_l1cached", KSTAT_DATA_UINT64 }, { "l2_free_on_write", KSTAT_DATA_UINT64 }, { "l2_abort_lowmem", KSTAT_DATA_UINT64 }, { "l2_cksum_bad", KSTAT_DATA_UINT64 }, { "l2_io_error", KSTAT_DATA_UINT64 }, { "l2_size", KSTAT_DATA_UINT64 }, { "l2_asize", KSTAT_DATA_UINT64 }, { "l2_hdr_size", KSTAT_DATA_UINT64 }, { "l2_log_blk_writes", KSTAT_DATA_UINT64 }, { "l2_log_blk_avg_asize", KSTAT_DATA_UINT64 }, { "l2_log_blk_asize", KSTAT_DATA_UINT64 }, { "l2_log_blk_count", KSTAT_DATA_UINT64 }, { "l2_data_to_meta_ratio", KSTAT_DATA_UINT64 }, { "l2_rebuild_success", KSTAT_DATA_UINT64 }, { "l2_rebuild_unsupported", KSTAT_DATA_UINT64 }, { "l2_rebuild_io_errors", KSTAT_DATA_UINT64 }, { "l2_rebuild_dh_errors", KSTAT_DATA_UINT64 }, { "l2_rebuild_cksum_lb_errors", KSTAT_DATA_UINT64 }, { "l2_rebuild_lowmem", KSTAT_DATA_UINT64 }, { "l2_rebuild_size", KSTAT_DATA_UINT64 }, { "l2_rebuild_asize", KSTAT_DATA_UINT64 }, { "l2_rebuild_bufs", KSTAT_DATA_UINT64 }, { "l2_rebuild_bufs_precached", KSTAT_DATA_UINT64 }, { "l2_rebuild_log_blks", KSTAT_DATA_UINT64 }, { "memory_throttle_count", KSTAT_DATA_UINT64 }, { "memory_direct_count", KSTAT_DATA_UINT64 }, { "memory_indirect_count", KSTAT_DATA_UINT64 }, { "memory_all_bytes", KSTAT_DATA_UINT64 }, { "memory_free_bytes", KSTAT_DATA_UINT64 }, { "memory_available_bytes", KSTAT_DATA_INT64 }, { "arc_no_grow", KSTAT_DATA_UINT64 }, { "arc_tempreserve", KSTAT_DATA_UINT64 }, { "arc_loaned_bytes", KSTAT_DATA_UINT64 }, { "arc_prune", KSTAT_DATA_UINT64 }, { "arc_meta_used", KSTAT_DATA_UINT64 }, { "arc_meta_limit", KSTAT_DATA_UINT64 }, { "arc_dnode_limit", KSTAT_DATA_UINT64 }, { "arc_meta_max", KSTAT_DATA_UINT64 }, { "arc_meta_min", KSTAT_DATA_UINT64 }, { "async_upgrade_sync", KSTAT_DATA_UINT64 }, { "demand_hit_predictive_prefetch", KSTAT_DATA_UINT64 }, { "demand_hit_prescient_prefetch", KSTAT_DATA_UINT64 }, { "arc_need_free", KSTAT_DATA_UINT64 }, { "arc_sys_free", KSTAT_DATA_UINT64 }, { "arc_raw_size", KSTAT_DATA_UINT64 }, { "cached_only_in_progress", KSTAT_DATA_UINT64 }, { "abd_chunk_waste_size", KSTAT_DATA_UINT64 }, }; arc_sums_t arc_sums; #define ARCSTAT_MAX(stat, val) { \ uint64_t m; \ while ((val) > (m = arc_stats.stat.value.ui64) && \ (m != atomic_cas_64(&arc_stats.stat.value.ui64, m, (val)))) \ continue; \ } /* * We define a macro to allow ARC hits/misses to be easily broken down by * two separate conditions, giving a total of four different subtypes for * each of hits and misses (so eight statistics total). */ #define ARCSTAT_CONDSTAT(cond1, stat1, notstat1, cond2, stat2, notstat2, stat) \ if (cond1) { \ if (cond2) { \ ARCSTAT_BUMP(arcstat_##stat1##_##stat2##_##stat); \ } else { \ ARCSTAT_BUMP(arcstat_##stat1##_##notstat2##_##stat); \ } \ } else { \ if (cond2) { \ ARCSTAT_BUMP(arcstat_##notstat1##_##stat2##_##stat); \ } else { \ ARCSTAT_BUMP(arcstat_##notstat1##_##notstat2##_##stat);\ } \ } /* * This macro allows us to use kstats as floating averages. Each time we * update this kstat, we first factor it and the update value by * ARCSTAT_AVG_FACTOR to shrink the new value's contribution to the overall * average. This macro assumes that integer loads and stores are atomic, but * is not safe for multiple writers updating the kstat in parallel (only the * last writer's update will remain). */ #define ARCSTAT_F_AVG_FACTOR 3 #define ARCSTAT_F_AVG(stat, value) \ do { \ uint64_t x = ARCSTAT(stat); \ x = x - x / ARCSTAT_F_AVG_FACTOR + \ (value) / ARCSTAT_F_AVG_FACTOR; \ ARCSTAT(stat) = x; \ } while (0) static kstat_t *arc_ksp; /* * There are several ARC variables that are critical to export as kstats -- * but we don't want to have to grovel around in the kstat whenever we wish to * manipulate them. For these variables, we therefore define them to be in * terms of the statistic variable. This assures that we are not introducing * the possibility of inconsistency by having shadow copies of the variables, * while still allowing the code to be readable. */ #define arc_tempreserve ARCSTAT(arcstat_tempreserve) #define arc_loaned_bytes ARCSTAT(arcstat_loaned_bytes) #define arc_meta_limit ARCSTAT(arcstat_meta_limit) /* max size for metadata */ /* max size for dnodes */ #define arc_dnode_size_limit ARCSTAT(arcstat_dnode_limit) #define arc_meta_min ARCSTAT(arcstat_meta_min) /* min size for metadata */ #define arc_need_free ARCSTAT(arcstat_need_free) /* waiting to be evicted */ hrtime_t arc_growtime; list_t arc_prune_list; kmutex_t arc_prune_mtx; taskq_t *arc_prune_taskq; #define GHOST_STATE(state) \ ((state) == arc_mru_ghost || (state) == arc_mfu_ghost || \ (state) == arc_l2c_only) #define HDR_IN_HASH_TABLE(hdr) ((hdr)->b_flags & ARC_FLAG_IN_HASH_TABLE) #define HDR_IO_IN_PROGRESS(hdr) ((hdr)->b_flags & ARC_FLAG_IO_IN_PROGRESS) #define HDR_IO_ERROR(hdr) ((hdr)->b_flags & ARC_FLAG_IO_ERROR) #define HDR_PREFETCH(hdr) ((hdr)->b_flags & ARC_FLAG_PREFETCH) #define HDR_PRESCIENT_PREFETCH(hdr) \ ((hdr)->b_flags & ARC_FLAG_PRESCIENT_PREFETCH) #define HDR_COMPRESSION_ENABLED(hdr) \ ((hdr)->b_flags & ARC_FLAG_COMPRESSED_ARC) #define HDR_L2CACHE(hdr) ((hdr)->b_flags & ARC_FLAG_L2CACHE) #define HDR_L2_READING(hdr) \ (((hdr)->b_flags & ARC_FLAG_IO_IN_PROGRESS) && \ ((hdr)->b_flags & ARC_FLAG_HAS_L2HDR)) #define HDR_L2_WRITING(hdr) ((hdr)->b_flags & ARC_FLAG_L2_WRITING) #define HDR_L2_EVICTED(hdr) ((hdr)->b_flags & ARC_FLAG_L2_EVICTED) #define HDR_L2_WRITE_HEAD(hdr) ((hdr)->b_flags & ARC_FLAG_L2_WRITE_HEAD) #define HDR_PROTECTED(hdr) ((hdr)->b_flags & ARC_FLAG_PROTECTED) #define HDR_NOAUTH(hdr) ((hdr)->b_flags & ARC_FLAG_NOAUTH) #define HDR_SHARED_DATA(hdr) ((hdr)->b_flags & ARC_FLAG_SHARED_DATA) #define HDR_ISTYPE_METADATA(hdr) \ ((hdr)->b_flags & ARC_FLAG_BUFC_METADATA) #define HDR_ISTYPE_DATA(hdr) (!HDR_ISTYPE_METADATA(hdr)) #define HDR_HAS_L1HDR(hdr) ((hdr)->b_flags & ARC_FLAG_HAS_L1HDR) #define HDR_HAS_L2HDR(hdr) ((hdr)->b_flags & ARC_FLAG_HAS_L2HDR) #define HDR_HAS_RABD(hdr) \ (HDR_HAS_L1HDR(hdr) && HDR_PROTECTED(hdr) && \ (hdr)->b_crypt_hdr.b_rabd != NULL) #define HDR_ENCRYPTED(hdr) \ (HDR_PROTECTED(hdr) && DMU_OT_IS_ENCRYPTED((hdr)->b_crypt_hdr.b_ot)) #define HDR_AUTHENTICATED(hdr) \ (HDR_PROTECTED(hdr) && !DMU_OT_IS_ENCRYPTED((hdr)->b_crypt_hdr.b_ot)) /* For storing compression mode in b_flags */ #define HDR_COMPRESS_OFFSET (highbit64(ARC_FLAG_COMPRESS_0) - 1) #define HDR_GET_COMPRESS(hdr) ((enum zio_compress)BF32_GET((hdr)->b_flags, \ HDR_COMPRESS_OFFSET, SPA_COMPRESSBITS)) #define HDR_SET_COMPRESS(hdr, cmp) BF32_SET((hdr)->b_flags, \ HDR_COMPRESS_OFFSET, SPA_COMPRESSBITS, (cmp)); #define ARC_BUF_LAST(buf) ((buf)->b_next == NULL) #define ARC_BUF_SHARED(buf) ((buf)->b_flags & ARC_BUF_FLAG_SHARED) #define ARC_BUF_COMPRESSED(buf) ((buf)->b_flags & ARC_BUF_FLAG_COMPRESSED) #define ARC_BUF_ENCRYPTED(buf) ((buf)->b_flags & ARC_BUF_FLAG_ENCRYPTED) /* * Other sizes */ #define HDR_FULL_CRYPT_SIZE ((int64_t)sizeof (arc_buf_hdr_t)) #define HDR_FULL_SIZE ((int64_t)offsetof(arc_buf_hdr_t, b_crypt_hdr)) #define HDR_L2ONLY_SIZE ((int64_t)offsetof(arc_buf_hdr_t, b_l1hdr)) /* * Hash table routines */ #define BUF_LOCKS 2048 typedef struct buf_hash_table { uint64_t ht_mask; arc_buf_hdr_t **ht_table; kmutex_t ht_locks[BUF_LOCKS] ____cacheline_aligned; } buf_hash_table_t; static buf_hash_table_t buf_hash_table; #define BUF_HASH_INDEX(spa, dva, birth) \ (buf_hash(spa, dva, birth) & buf_hash_table.ht_mask) #define BUF_HASH_LOCK(idx) (&buf_hash_table.ht_locks[idx & (BUF_LOCKS-1)]) #define HDR_LOCK(hdr) \ (BUF_HASH_LOCK(BUF_HASH_INDEX(hdr->b_spa, &hdr->b_dva, hdr->b_birth))) uint64_t zfs_crc64_table[256]; /* * Level 2 ARC */ #define L2ARC_WRITE_SIZE (8 * 1024 * 1024) /* initial write max */ #define L2ARC_HEADROOM 2 /* num of writes */ /* * If we discover during ARC scan any buffers to be compressed, we boost * our headroom for the next scanning cycle by this percentage multiple. */ #define L2ARC_HEADROOM_BOOST 200 #define L2ARC_FEED_SECS 1 /* caching interval secs */ #define L2ARC_FEED_MIN_MS 200 /* min caching interval ms */ /* * We can feed L2ARC from two states of ARC buffers, mru and mfu, * and each of the state has two types: data and metadata. */ #define L2ARC_FEED_TYPES 4 /* L2ARC Performance Tunables */ uint64_t l2arc_write_max = L2ARC_WRITE_SIZE; /* def max write size */ uint64_t l2arc_write_boost = L2ARC_WRITE_SIZE; /* extra warmup write */ uint64_t l2arc_headroom = L2ARC_HEADROOM; /* # of dev writes */ uint64_t l2arc_headroom_boost = L2ARC_HEADROOM_BOOST; uint64_t l2arc_feed_secs = L2ARC_FEED_SECS; /* interval seconds */ uint64_t l2arc_feed_min_ms = L2ARC_FEED_MIN_MS; /* min interval msecs */ int l2arc_noprefetch = B_TRUE; /* don't cache prefetch bufs */ int l2arc_feed_again = B_TRUE; /* turbo warmup */ int l2arc_norw = B_FALSE; /* no reads during writes */ static uint_t l2arc_meta_percent = 33; /* limit on headers size */ /* * L2ARC Internals */ static list_t L2ARC_dev_list; /* device list */ static list_t *l2arc_dev_list; /* device list pointer */ static kmutex_t l2arc_dev_mtx; /* device list mutex */ static l2arc_dev_t *l2arc_dev_last; /* last device used */ static list_t L2ARC_free_on_write; /* free after write buf list */ static list_t *l2arc_free_on_write; /* free after write list ptr */ static kmutex_t l2arc_free_on_write_mtx; /* mutex for list */ static uint64_t l2arc_ndev; /* number of devices */ typedef struct l2arc_read_callback { arc_buf_hdr_t *l2rcb_hdr; /* read header */ blkptr_t l2rcb_bp; /* original blkptr */ zbookmark_phys_t l2rcb_zb; /* original bookmark */ int l2rcb_flags; /* original flags */ abd_t *l2rcb_abd; /* temporary buffer */ } l2arc_read_callback_t; typedef struct l2arc_data_free { /* protected by l2arc_free_on_write_mtx */ abd_t *l2df_abd; size_t l2df_size; arc_buf_contents_t l2df_type; list_node_t l2df_list_node; } l2arc_data_free_t; typedef enum arc_fill_flags { ARC_FILL_LOCKED = 1 << 0, /* hdr lock is held */ ARC_FILL_COMPRESSED = 1 << 1, /* fill with compressed data */ ARC_FILL_ENCRYPTED = 1 << 2, /* fill with encrypted data */ ARC_FILL_NOAUTH = 1 << 3, /* don't attempt to authenticate */ ARC_FILL_IN_PLACE = 1 << 4 /* fill in place (special case) */ } arc_fill_flags_t; typedef enum arc_ovf_level { ARC_OVF_NONE, /* ARC within target size. */ ARC_OVF_SOME, /* ARC is slightly overflowed. */ ARC_OVF_SEVERE /* ARC is severely overflowed. */ } arc_ovf_level_t; static kmutex_t l2arc_feed_thr_lock; static kcondvar_t l2arc_feed_thr_cv; static uint8_t l2arc_thread_exit; static kmutex_t l2arc_rebuild_thr_lock; static kcondvar_t l2arc_rebuild_thr_cv; enum arc_hdr_alloc_flags { ARC_HDR_ALLOC_RDATA = 0x1, ARC_HDR_DO_ADAPT = 0x2, ARC_HDR_USE_RESERVE = 0x4, }; static abd_t *arc_get_data_abd(arc_buf_hdr_t *, uint64_t, const void *, int); static void *arc_get_data_buf(arc_buf_hdr_t *, uint64_t, const void *); static void arc_get_data_impl(arc_buf_hdr_t *, uint64_t, const void *, int); static void arc_free_data_abd(arc_buf_hdr_t *, abd_t *, uint64_t, const void *); static void arc_free_data_buf(arc_buf_hdr_t *, void *, uint64_t, const void *); static void arc_free_data_impl(arc_buf_hdr_t *hdr, uint64_t size, const void *tag); static void arc_hdr_free_abd(arc_buf_hdr_t *, boolean_t); static void arc_hdr_alloc_abd(arc_buf_hdr_t *, int); static void arc_access(arc_buf_hdr_t *, kmutex_t *); static void arc_buf_watch(arc_buf_t *); static arc_buf_contents_t arc_buf_type(arc_buf_hdr_t *); static uint32_t arc_bufc_to_flags(arc_buf_contents_t); static inline void arc_hdr_set_flags(arc_buf_hdr_t *hdr, arc_flags_t flags); static inline void arc_hdr_clear_flags(arc_buf_hdr_t *hdr, arc_flags_t flags); static boolean_t l2arc_write_eligible(uint64_t, arc_buf_hdr_t *); static void l2arc_read_done(zio_t *); static void l2arc_do_free_on_write(void); static void l2arc_hdr_arcstats_update(arc_buf_hdr_t *hdr, boolean_t incr, boolean_t state_only); #define l2arc_hdr_arcstats_increment(hdr) \ l2arc_hdr_arcstats_update((hdr), B_TRUE, B_FALSE) #define l2arc_hdr_arcstats_decrement(hdr) \ l2arc_hdr_arcstats_update((hdr), B_FALSE, B_FALSE) #define l2arc_hdr_arcstats_increment_state(hdr) \ l2arc_hdr_arcstats_update((hdr), B_TRUE, B_TRUE) #define l2arc_hdr_arcstats_decrement_state(hdr) \ l2arc_hdr_arcstats_update((hdr), B_FALSE, B_TRUE) /* * l2arc_exclude_special : A zfs module parameter that controls whether buffers * present on special vdevs are eligibile for caching in L2ARC. If * set to 1, exclude dbufs on special vdevs from being cached to * L2ARC. */ int l2arc_exclude_special = 0; /* * l2arc_mfuonly : A ZFS module parameter that controls whether only MFU * metadata and data are cached from ARC into L2ARC. */ static int l2arc_mfuonly = 0; /* * L2ARC TRIM * l2arc_trim_ahead : A ZFS module parameter that controls how much ahead of * the current write size (l2arc_write_max) we should TRIM if we * have filled the device. It is defined as a percentage of the * write size. If set to 100 we trim twice the space required to * accommodate upcoming writes. A minimum of 64MB will be trimmed. * It also enables TRIM of the whole L2ARC device upon creation or * addition to an existing pool or if the header of the device is * invalid upon importing a pool or onlining a cache device. The * default is 0, which disables TRIM on L2ARC altogether as it can * put significant stress on the underlying storage devices. This * will vary depending of how well the specific device handles * these commands. */ static uint64_t l2arc_trim_ahead = 0; /* * Performance tuning of L2ARC persistence: * * l2arc_rebuild_enabled : A ZFS module parameter that controls whether adding * an L2ARC device (either at pool import or later) will attempt * to rebuild L2ARC buffer contents. * l2arc_rebuild_blocks_min_l2size : A ZFS module parameter that controls * whether log blocks are written to the L2ARC device. If the L2ARC * device is less than 1GB, the amount of data l2arc_evict() * evicts is significant compared to the amount of restored L2ARC * data. In this case do not write log blocks in L2ARC in order * not to waste space. */ static int l2arc_rebuild_enabled = B_TRUE; static uint64_t l2arc_rebuild_blocks_min_l2size = 1024 * 1024 * 1024; /* L2ARC persistence rebuild control routines. */ void l2arc_rebuild_vdev(vdev_t *vd, boolean_t reopen); static __attribute__((noreturn)) void l2arc_dev_rebuild_thread(void *arg); static int l2arc_rebuild(l2arc_dev_t *dev); /* L2ARC persistence read I/O routines. */ static int l2arc_dev_hdr_read(l2arc_dev_t *dev); static int l2arc_log_blk_read(l2arc_dev_t *dev, const l2arc_log_blkptr_t *this_lp, const l2arc_log_blkptr_t *next_lp, l2arc_log_blk_phys_t *this_lb, l2arc_log_blk_phys_t *next_lb, zio_t *this_io, zio_t **next_io); static zio_t *l2arc_log_blk_fetch(vdev_t *vd, const l2arc_log_blkptr_t *lp, l2arc_log_blk_phys_t *lb); static void l2arc_log_blk_fetch_abort(zio_t *zio); /* L2ARC persistence block restoration routines. */ static void l2arc_log_blk_restore(l2arc_dev_t *dev, const l2arc_log_blk_phys_t *lb, uint64_t lb_asize); static void l2arc_hdr_restore(const l2arc_log_ent_phys_t *le, l2arc_dev_t *dev); /* L2ARC persistence write I/O routines. */ static void l2arc_log_blk_commit(l2arc_dev_t *dev, zio_t *pio, l2arc_write_callback_t *cb); /* L2ARC persistence auxiliary routines. */ boolean_t l2arc_log_blkptr_valid(l2arc_dev_t *dev, const l2arc_log_blkptr_t *lbp); static boolean_t l2arc_log_blk_insert(l2arc_dev_t *dev, const arc_buf_hdr_t *ab); boolean_t l2arc_range_check_overlap(uint64_t bottom, uint64_t top, uint64_t check); static void l2arc_blk_fetch_done(zio_t *zio); static inline uint64_t l2arc_log_blk_overhead(uint64_t write_sz, l2arc_dev_t *dev); /* * We use Cityhash for this. It's fast, and has good hash properties without * requiring any large static buffers. */ static uint64_t buf_hash(uint64_t spa, const dva_t *dva, uint64_t birth) { return (cityhash4(spa, dva->dva_word[0], dva->dva_word[1], birth)); } #define HDR_EMPTY(hdr) \ ((hdr)->b_dva.dva_word[0] == 0 && \ (hdr)->b_dva.dva_word[1] == 0) #define HDR_EMPTY_OR_LOCKED(hdr) \ (HDR_EMPTY(hdr) || MUTEX_HELD(HDR_LOCK(hdr))) #define HDR_EQUAL(spa, dva, birth, hdr) \ ((hdr)->b_dva.dva_word[0] == (dva)->dva_word[0]) && \ ((hdr)->b_dva.dva_word[1] == (dva)->dva_word[1]) && \ ((hdr)->b_birth == birth) && ((hdr)->b_spa == spa) static void buf_discard_identity(arc_buf_hdr_t *hdr) { hdr->b_dva.dva_word[0] = 0; hdr->b_dva.dva_word[1] = 0; hdr->b_birth = 0; } static arc_buf_hdr_t * buf_hash_find(uint64_t spa, const blkptr_t *bp, kmutex_t **lockp) { const dva_t *dva = BP_IDENTITY(bp); uint64_t birth = BP_PHYSICAL_BIRTH(bp); uint64_t idx = BUF_HASH_INDEX(spa, dva, birth); kmutex_t *hash_lock = BUF_HASH_LOCK(idx); arc_buf_hdr_t *hdr; mutex_enter(hash_lock); for (hdr = buf_hash_table.ht_table[idx]; hdr != NULL; hdr = hdr->b_hash_next) { if (HDR_EQUAL(spa, dva, birth, hdr)) { *lockp = hash_lock; return (hdr); } } mutex_exit(hash_lock); *lockp = NULL; return (NULL); } /* * Insert an entry into the hash table. If there is already an element * equal to elem in the hash table, then the already existing element * will be returned and the new element will not be inserted. * Otherwise returns NULL. * If lockp == NULL, the caller is assumed to already hold the hash lock. */ static arc_buf_hdr_t * buf_hash_insert(arc_buf_hdr_t *hdr, kmutex_t **lockp) { uint64_t idx = BUF_HASH_INDEX(hdr->b_spa, &hdr->b_dva, hdr->b_birth); kmutex_t *hash_lock = BUF_HASH_LOCK(idx); arc_buf_hdr_t *fhdr; uint32_t i; ASSERT(!DVA_IS_EMPTY(&hdr->b_dva)); ASSERT(hdr->b_birth != 0); ASSERT(!HDR_IN_HASH_TABLE(hdr)); if (lockp != NULL) { *lockp = hash_lock; mutex_enter(hash_lock); } else { ASSERT(MUTEX_HELD(hash_lock)); } for (fhdr = buf_hash_table.ht_table[idx], i = 0; fhdr != NULL; fhdr = fhdr->b_hash_next, i++) { if (HDR_EQUAL(hdr->b_spa, &hdr->b_dva, hdr->b_birth, fhdr)) return (fhdr); } hdr->b_hash_next = buf_hash_table.ht_table[idx]; buf_hash_table.ht_table[idx] = hdr; arc_hdr_set_flags(hdr, ARC_FLAG_IN_HASH_TABLE); /* collect some hash table performance data */ if (i > 0) { ARCSTAT_BUMP(arcstat_hash_collisions); if (i == 1) ARCSTAT_BUMP(arcstat_hash_chains); ARCSTAT_MAX(arcstat_hash_chain_max, i); } uint64_t he = atomic_inc_64_nv( &arc_stats.arcstat_hash_elements.value.ui64); ARCSTAT_MAX(arcstat_hash_elements_max, he); return (NULL); } static void buf_hash_remove(arc_buf_hdr_t *hdr) { arc_buf_hdr_t *fhdr, **hdrp; uint64_t idx = BUF_HASH_INDEX(hdr->b_spa, &hdr->b_dva, hdr->b_birth); ASSERT(MUTEX_HELD(BUF_HASH_LOCK(idx))); ASSERT(HDR_IN_HASH_TABLE(hdr)); hdrp = &buf_hash_table.ht_table[idx]; while ((fhdr = *hdrp) != hdr) { ASSERT3P(fhdr, !=, NULL); hdrp = &fhdr->b_hash_next; } *hdrp = hdr->b_hash_next; hdr->b_hash_next = NULL; arc_hdr_clear_flags(hdr, ARC_FLAG_IN_HASH_TABLE); /* collect some hash table performance data */ atomic_dec_64(&arc_stats.arcstat_hash_elements.value.ui64); if (buf_hash_table.ht_table[idx] && buf_hash_table.ht_table[idx]->b_hash_next == NULL) ARCSTAT_BUMPDOWN(arcstat_hash_chains); } /* * Global data structures and functions for the buf kmem cache. */ static kmem_cache_t *hdr_full_cache; static kmem_cache_t *hdr_full_crypt_cache; static kmem_cache_t *hdr_l2only_cache; static kmem_cache_t *buf_cache; static void buf_fini(void) { #if defined(_KERNEL) /* * Large allocations which do not require contiguous pages * should be using vmem_free() in the linux kernel\ */ vmem_free(buf_hash_table.ht_table, (buf_hash_table.ht_mask + 1) * sizeof (void *)); #else kmem_free(buf_hash_table.ht_table, (buf_hash_table.ht_mask + 1) * sizeof (void *)); #endif for (int i = 0; i < BUF_LOCKS; i++) mutex_destroy(BUF_HASH_LOCK(i)); kmem_cache_destroy(hdr_full_cache); kmem_cache_destroy(hdr_full_crypt_cache); kmem_cache_destroy(hdr_l2only_cache); kmem_cache_destroy(buf_cache); } /* * Constructor callback - called when the cache is empty * and a new buf is requested. */ static int hdr_full_cons(void *vbuf, void *unused, int kmflag) { (void) unused, (void) kmflag; arc_buf_hdr_t *hdr = vbuf; memset(hdr, 0, HDR_FULL_SIZE); hdr->b_l1hdr.b_byteswap = DMU_BSWAP_NUMFUNCS; cv_init(&hdr->b_l1hdr.b_cv, NULL, CV_DEFAULT, NULL); zfs_refcount_create(&hdr->b_l1hdr.b_refcnt); mutex_init(&hdr->b_l1hdr.b_freeze_lock, NULL, MUTEX_DEFAULT, NULL); list_link_init(&hdr->b_l1hdr.b_arc_node); list_link_init(&hdr->b_l2hdr.b_l2node); multilist_link_init(&hdr->b_l1hdr.b_arc_node); arc_space_consume(HDR_FULL_SIZE, ARC_SPACE_HDRS); return (0); } static int hdr_full_crypt_cons(void *vbuf, void *unused, int kmflag) { (void) unused; arc_buf_hdr_t *hdr = vbuf; hdr_full_cons(vbuf, unused, kmflag); memset(&hdr->b_crypt_hdr, 0, sizeof (hdr->b_crypt_hdr)); arc_space_consume(sizeof (hdr->b_crypt_hdr), ARC_SPACE_HDRS); return (0); } static int hdr_l2only_cons(void *vbuf, void *unused, int kmflag) { (void) unused, (void) kmflag; arc_buf_hdr_t *hdr = vbuf; memset(hdr, 0, HDR_L2ONLY_SIZE); arc_space_consume(HDR_L2ONLY_SIZE, ARC_SPACE_L2HDRS); return (0); } static int buf_cons(void *vbuf, void *unused, int kmflag) { (void) unused, (void) kmflag; arc_buf_t *buf = vbuf; memset(buf, 0, sizeof (arc_buf_t)); mutex_init(&buf->b_evict_lock, NULL, MUTEX_DEFAULT, NULL); arc_space_consume(sizeof (arc_buf_t), ARC_SPACE_HDRS); return (0); } /* * Destructor callback - called when a cached buf is * no longer required. */ static void hdr_full_dest(void *vbuf, void *unused) { (void) unused; arc_buf_hdr_t *hdr = vbuf; ASSERT(HDR_EMPTY(hdr)); cv_destroy(&hdr->b_l1hdr.b_cv); zfs_refcount_destroy(&hdr->b_l1hdr.b_refcnt); mutex_destroy(&hdr->b_l1hdr.b_freeze_lock); ASSERT(!multilist_link_active(&hdr->b_l1hdr.b_arc_node)); arc_space_return(HDR_FULL_SIZE, ARC_SPACE_HDRS); } static void hdr_full_crypt_dest(void *vbuf, void *unused) { (void) vbuf, (void) unused; hdr_full_dest(vbuf, unused); arc_space_return(sizeof (((arc_buf_hdr_t *)NULL)->b_crypt_hdr), ARC_SPACE_HDRS); } static void hdr_l2only_dest(void *vbuf, void *unused) { (void) unused; arc_buf_hdr_t *hdr = vbuf; ASSERT(HDR_EMPTY(hdr)); arc_space_return(HDR_L2ONLY_SIZE, ARC_SPACE_L2HDRS); } static void buf_dest(void *vbuf, void *unused) { (void) unused; arc_buf_t *buf = vbuf; mutex_destroy(&buf->b_evict_lock); arc_space_return(sizeof (arc_buf_t), ARC_SPACE_HDRS); } static void buf_init(void) { uint64_t *ct = NULL; uint64_t hsize = 1ULL << 12; int i, j; /* * The hash table is big enough to fill all of physical memory * with an average block size of zfs_arc_average_blocksize (default 8K). * By default, the table will take up * totalmem * sizeof(void*) / 8K (1MB per GB with 8-byte pointers). */ while (hsize * zfs_arc_average_blocksize < arc_all_memory()) hsize <<= 1; retry: buf_hash_table.ht_mask = hsize - 1; #if defined(_KERNEL) /* * Large allocations which do not require contiguous pages * should be using vmem_alloc() in the linux kernel */ buf_hash_table.ht_table = vmem_zalloc(hsize * sizeof (void*), KM_SLEEP); #else buf_hash_table.ht_table = kmem_zalloc(hsize * sizeof (void*), KM_NOSLEEP); #endif if (buf_hash_table.ht_table == NULL) { ASSERT(hsize > (1ULL << 8)); hsize >>= 1; goto retry; } hdr_full_cache = kmem_cache_create("arc_buf_hdr_t_full", HDR_FULL_SIZE, 0, hdr_full_cons, hdr_full_dest, NULL, NULL, NULL, 0); hdr_full_crypt_cache = kmem_cache_create("arc_buf_hdr_t_full_crypt", HDR_FULL_CRYPT_SIZE, 0, hdr_full_crypt_cons, hdr_full_crypt_dest, NULL, NULL, NULL, 0); hdr_l2only_cache = kmem_cache_create("arc_buf_hdr_t_l2only", HDR_L2ONLY_SIZE, 0, hdr_l2only_cons, hdr_l2only_dest, NULL, NULL, NULL, 0); buf_cache = kmem_cache_create("arc_buf_t", sizeof (arc_buf_t), 0, buf_cons, buf_dest, NULL, NULL, NULL, 0); for (i = 0; i < 256; i++) for (ct = zfs_crc64_table + i, *ct = i, j = 8; j > 0; j--) *ct = (*ct >> 1) ^ (-(*ct & 1) & ZFS_CRC64_POLY); for (i = 0; i < BUF_LOCKS; i++) mutex_init(BUF_HASH_LOCK(i), NULL, MUTEX_DEFAULT, NULL); } #define ARC_MINTIME (hz>>4) /* 62 ms */ /* * This is the size that the buf occupies in memory. If the buf is compressed, * it will correspond to the compressed size. You should use this method of * getting the buf size unless you explicitly need the logical size. */ uint64_t arc_buf_size(arc_buf_t *buf) { return (ARC_BUF_COMPRESSED(buf) ? HDR_GET_PSIZE(buf->b_hdr) : HDR_GET_LSIZE(buf->b_hdr)); } uint64_t arc_buf_lsize(arc_buf_t *buf) { return (HDR_GET_LSIZE(buf->b_hdr)); } /* * This function will return B_TRUE if the buffer is encrypted in memory. * This buffer can be decrypted by calling arc_untransform(). */ boolean_t arc_is_encrypted(arc_buf_t *buf) { return (ARC_BUF_ENCRYPTED(buf) != 0); } /* * Returns B_TRUE if the buffer represents data that has not had its MAC * verified yet. */ boolean_t arc_is_unauthenticated(arc_buf_t *buf) { return (HDR_NOAUTH(buf->b_hdr) != 0); } void arc_get_raw_params(arc_buf_t *buf, boolean_t *byteorder, uint8_t *salt, uint8_t *iv, uint8_t *mac) { arc_buf_hdr_t *hdr = buf->b_hdr; ASSERT(HDR_PROTECTED(hdr)); memcpy(salt, hdr->b_crypt_hdr.b_salt, ZIO_DATA_SALT_LEN); memcpy(iv, hdr->b_crypt_hdr.b_iv, ZIO_DATA_IV_LEN); memcpy(mac, hdr->b_crypt_hdr.b_mac, ZIO_DATA_MAC_LEN); *byteorder = (hdr->b_l1hdr.b_byteswap == DMU_BSWAP_NUMFUNCS) ? ZFS_HOST_BYTEORDER : !ZFS_HOST_BYTEORDER; } /* * Indicates how this buffer is compressed in memory. If it is not compressed * the value will be ZIO_COMPRESS_OFF. It can be made normally readable with * arc_untransform() as long as it is also unencrypted. */ enum zio_compress arc_get_compression(arc_buf_t *buf) { return (ARC_BUF_COMPRESSED(buf) ? HDR_GET_COMPRESS(buf->b_hdr) : ZIO_COMPRESS_OFF); } /* * Return the compression algorithm used to store this data in the ARC. If ARC * compression is enabled or this is an encrypted block, this will be the same * as what's used to store it on-disk. Otherwise, this will be ZIO_COMPRESS_OFF. */ static inline enum zio_compress arc_hdr_get_compress(arc_buf_hdr_t *hdr) { return (HDR_COMPRESSION_ENABLED(hdr) ? HDR_GET_COMPRESS(hdr) : ZIO_COMPRESS_OFF); } uint8_t arc_get_complevel(arc_buf_t *buf) { return (buf->b_hdr->b_complevel); } static inline boolean_t arc_buf_is_shared(arc_buf_t *buf) { boolean_t shared = (buf->b_data != NULL && buf->b_hdr->b_l1hdr.b_pabd != NULL && abd_is_linear(buf->b_hdr->b_l1hdr.b_pabd) && buf->b_data == abd_to_buf(buf->b_hdr->b_l1hdr.b_pabd)); IMPLY(shared, HDR_SHARED_DATA(buf->b_hdr)); IMPLY(shared, ARC_BUF_SHARED(buf)); IMPLY(shared, ARC_BUF_COMPRESSED(buf) || ARC_BUF_LAST(buf)); /* * It would be nice to assert arc_can_share() too, but the "hdr isn't * already being shared" requirement prevents us from doing that. */ return (shared); } /* * Free the checksum associated with this header. If there is no checksum, this * is a no-op. */ static inline void arc_cksum_free(arc_buf_hdr_t *hdr) { ASSERT(HDR_HAS_L1HDR(hdr)); mutex_enter(&hdr->b_l1hdr.b_freeze_lock); if (hdr->b_l1hdr.b_freeze_cksum != NULL) { kmem_free(hdr->b_l1hdr.b_freeze_cksum, sizeof (zio_cksum_t)); hdr->b_l1hdr.b_freeze_cksum = NULL; } mutex_exit(&hdr->b_l1hdr.b_freeze_lock); } /* * Return true iff at least one of the bufs on hdr is not compressed. * Encrypted buffers count as compressed. */ static boolean_t arc_hdr_has_uncompressed_buf(arc_buf_hdr_t *hdr) { ASSERT(hdr->b_l1hdr.b_state == arc_anon || HDR_EMPTY_OR_LOCKED(hdr)); for (arc_buf_t *b = hdr->b_l1hdr.b_buf; b != NULL; b = b->b_next) { if (!ARC_BUF_COMPRESSED(b)) { return (B_TRUE); } } return (B_FALSE); } /* * If we've turned on the ZFS_DEBUG_MODIFY flag, verify that the buf's data * matches the checksum that is stored in the hdr. If there is no checksum, * or if the buf is compressed, this is a no-op. */ static void arc_cksum_verify(arc_buf_t *buf) { arc_buf_hdr_t *hdr = buf->b_hdr; zio_cksum_t zc; if (!(zfs_flags & ZFS_DEBUG_MODIFY)) return; if (ARC_BUF_COMPRESSED(buf)) return; ASSERT(HDR_HAS_L1HDR(hdr)); mutex_enter(&hdr->b_l1hdr.b_freeze_lock); if (hdr->b_l1hdr.b_freeze_cksum == NULL || HDR_IO_ERROR(hdr)) { mutex_exit(&hdr->b_l1hdr.b_freeze_lock); return; } fletcher_2_native(buf->b_data, arc_buf_size(buf), NULL, &zc); if (!ZIO_CHECKSUM_EQUAL(*hdr->b_l1hdr.b_freeze_cksum, zc)) panic("buffer modified while frozen!"); mutex_exit(&hdr->b_l1hdr.b_freeze_lock); } /* * This function makes the assumption that data stored in the L2ARC * will be transformed exactly as it is in the main pool. Because of * this we can verify the checksum against the reading process's bp. */ static boolean_t arc_cksum_is_equal(arc_buf_hdr_t *hdr, zio_t *zio) { ASSERT(!BP_IS_EMBEDDED(zio->io_bp)); VERIFY3U(BP_GET_PSIZE(zio->io_bp), ==, HDR_GET_PSIZE(hdr)); /* * Block pointers always store the checksum for the logical data. * If the block pointer has the gang bit set, then the checksum * it represents is for the reconstituted data and not for an * individual gang member. The zio pipeline, however, must be able to * determine the checksum of each of the gang constituents so it * treats the checksum comparison differently than what we need * for l2arc blocks. This prevents us from using the * zio_checksum_error() interface directly. Instead we must call the * zio_checksum_error_impl() so that we can ensure the checksum is * generated using the correct checksum algorithm and accounts for the * logical I/O size and not just a gang fragment. */ return (zio_checksum_error_impl(zio->io_spa, zio->io_bp, BP_GET_CHECKSUM(zio->io_bp), zio->io_abd, zio->io_size, zio->io_offset, NULL) == 0); } /* * Given a buf full of data, if ZFS_DEBUG_MODIFY is enabled this computes a * checksum and attaches it to the buf's hdr so that we can ensure that the buf * isn't modified later on. If buf is compressed or there is already a checksum * on the hdr, this is a no-op (we only checksum uncompressed bufs). */ static void arc_cksum_compute(arc_buf_t *buf) { arc_buf_hdr_t *hdr = buf->b_hdr; if (!(zfs_flags & ZFS_DEBUG_MODIFY)) return; ASSERT(HDR_HAS_L1HDR(hdr)); mutex_enter(&buf->b_hdr->b_l1hdr.b_freeze_lock); if (hdr->b_l1hdr.b_freeze_cksum != NULL || ARC_BUF_COMPRESSED(buf)) { mutex_exit(&hdr->b_l1hdr.b_freeze_lock); return; } ASSERT(!ARC_BUF_ENCRYPTED(buf)); ASSERT(!ARC_BUF_COMPRESSED(buf)); hdr->b_l1hdr.b_freeze_cksum = kmem_alloc(sizeof (zio_cksum_t), KM_SLEEP); fletcher_2_native(buf->b_data, arc_buf_size(buf), NULL, hdr->b_l1hdr.b_freeze_cksum); mutex_exit(&hdr->b_l1hdr.b_freeze_lock); arc_buf_watch(buf); } #ifndef _KERNEL void arc_buf_sigsegv(int sig, siginfo_t *si, void *unused) { (void) sig, (void) unused; panic("Got SIGSEGV at address: 0x%lx\n", (long)si->si_addr); } #endif static void arc_buf_unwatch(arc_buf_t *buf) { #ifndef _KERNEL if (arc_watch) { ASSERT0(mprotect(buf->b_data, arc_buf_size(buf), PROT_READ | PROT_WRITE)); } #else (void) buf; #endif } static void arc_buf_watch(arc_buf_t *buf) { #ifndef _KERNEL if (arc_watch) ASSERT0(mprotect(buf->b_data, arc_buf_size(buf), PROT_READ)); #else (void) buf; #endif } static arc_buf_contents_t arc_buf_type(arc_buf_hdr_t *hdr) { arc_buf_contents_t type; if (HDR_ISTYPE_METADATA(hdr)) { type = ARC_BUFC_METADATA; } else { type = ARC_BUFC_DATA; } VERIFY3U(hdr->b_type, ==, type); return (type); } boolean_t arc_is_metadata(arc_buf_t *buf) { return (HDR_ISTYPE_METADATA(buf->b_hdr) != 0); } static uint32_t arc_bufc_to_flags(arc_buf_contents_t type) { switch (type) { case ARC_BUFC_DATA: /* metadata field is 0 if buffer contains normal data */ return (0); case ARC_BUFC_METADATA: return (ARC_FLAG_BUFC_METADATA); default: break; } panic("undefined ARC buffer type!"); return ((uint32_t)-1); } void arc_buf_thaw(arc_buf_t *buf) { arc_buf_hdr_t *hdr = buf->b_hdr; ASSERT3P(hdr->b_l1hdr.b_state, ==, arc_anon); ASSERT(!HDR_IO_IN_PROGRESS(hdr)); arc_cksum_verify(buf); /* * Compressed buffers do not manipulate the b_freeze_cksum. */ if (ARC_BUF_COMPRESSED(buf)) return; ASSERT(HDR_HAS_L1HDR(hdr)); arc_cksum_free(hdr); arc_buf_unwatch(buf); } void arc_buf_freeze(arc_buf_t *buf) { if (!(zfs_flags & ZFS_DEBUG_MODIFY)) return; if (ARC_BUF_COMPRESSED(buf)) return; ASSERT(HDR_HAS_L1HDR(buf->b_hdr)); arc_cksum_compute(buf); } /* * The arc_buf_hdr_t's b_flags should never be modified directly. Instead, * the following functions should be used to ensure that the flags are * updated in a thread-safe way. When manipulating the flags either * the hash_lock must be held or the hdr must be undiscoverable. This * ensures that we're not racing with any other threads when updating * the flags. */ static inline void arc_hdr_set_flags(arc_buf_hdr_t *hdr, arc_flags_t flags) { ASSERT(HDR_EMPTY_OR_LOCKED(hdr)); hdr->b_flags |= flags; } static inline void arc_hdr_clear_flags(arc_buf_hdr_t *hdr, arc_flags_t flags) { ASSERT(HDR_EMPTY_OR_LOCKED(hdr)); hdr->b_flags &= ~flags; } /* * Setting the compression bits in the arc_buf_hdr_t's b_flags is * done in a special way since we have to clear and set bits * at the same time. Consumers that wish to set the compression bits * must use this function to ensure that the flags are updated in * thread-safe manner. */ static void arc_hdr_set_compress(arc_buf_hdr_t *hdr, enum zio_compress cmp) { ASSERT(HDR_EMPTY_OR_LOCKED(hdr)); /* * Holes and embedded blocks will always have a psize = 0 so * we ignore the compression of the blkptr and set the * want to uncompress them. Mark them as uncompressed. */ if (!zfs_compressed_arc_enabled || HDR_GET_PSIZE(hdr) == 0) { arc_hdr_clear_flags(hdr, ARC_FLAG_COMPRESSED_ARC); ASSERT(!HDR_COMPRESSION_ENABLED(hdr)); } else { arc_hdr_set_flags(hdr, ARC_FLAG_COMPRESSED_ARC); ASSERT(HDR_COMPRESSION_ENABLED(hdr)); } HDR_SET_COMPRESS(hdr, cmp); ASSERT3U(HDR_GET_COMPRESS(hdr), ==, cmp); } /* * Looks for another buf on the same hdr which has the data decompressed, copies * from it, and returns true. If no such buf exists, returns false. */ static boolean_t arc_buf_try_copy_decompressed_data(arc_buf_t *buf) { arc_buf_hdr_t *hdr = buf->b_hdr; boolean_t copied = B_FALSE; ASSERT(HDR_HAS_L1HDR(hdr)); ASSERT3P(buf->b_data, !=, NULL); ASSERT(!ARC_BUF_COMPRESSED(buf)); for (arc_buf_t *from = hdr->b_l1hdr.b_buf; from != NULL; from = from->b_next) { /* can't use our own data buffer */ if (from == buf) { continue; } if (!ARC_BUF_COMPRESSED(from)) { memcpy(buf->b_data, from->b_data, arc_buf_size(buf)); copied = B_TRUE; break; } } /* * There were no decompressed bufs, so there should not be a * checksum on the hdr either. */ if (zfs_flags & ZFS_DEBUG_MODIFY) EQUIV(!copied, hdr->b_l1hdr.b_freeze_cksum == NULL); return (copied); } /* * Allocates an ARC buf header that's in an evicted & L2-cached state. * This is used during l2arc reconstruction to make empty ARC buffers * which circumvent the regular disk->arc->l2arc path and instead come * into being in the reverse order, i.e. l2arc->arc. */ static arc_buf_hdr_t * arc_buf_alloc_l2only(size_t size, arc_buf_contents_t type, l2arc_dev_t *dev, dva_t dva, uint64_t daddr, int32_t psize, uint64_t birth, enum zio_compress compress, uint8_t complevel, boolean_t protected, boolean_t prefetch, arc_state_type_t arcs_state) { arc_buf_hdr_t *hdr; ASSERT(size != 0); hdr = kmem_cache_alloc(hdr_l2only_cache, KM_SLEEP); hdr->b_birth = birth; hdr->b_type = type; hdr->b_flags = 0; arc_hdr_set_flags(hdr, arc_bufc_to_flags(type) | ARC_FLAG_HAS_L2HDR); HDR_SET_LSIZE(hdr, size); HDR_SET_PSIZE(hdr, psize); arc_hdr_set_compress(hdr, compress); hdr->b_complevel = complevel; if (protected) arc_hdr_set_flags(hdr, ARC_FLAG_PROTECTED); if (prefetch) arc_hdr_set_flags(hdr, ARC_FLAG_PREFETCH); hdr->b_spa = spa_load_guid(dev->l2ad_vdev->vdev_spa); hdr->b_dva = dva; hdr->b_l2hdr.b_dev = dev; hdr->b_l2hdr.b_daddr = daddr; hdr->b_l2hdr.b_arcs_state = arcs_state; return (hdr); } /* * Return the size of the block, b_pabd, that is stored in the arc_buf_hdr_t. */ static uint64_t arc_hdr_size(arc_buf_hdr_t *hdr) { uint64_t size; if (arc_hdr_get_compress(hdr) != ZIO_COMPRESS_OFF && HDR_GET_PSIZE(hdr) > 0) { size = HDR_GET_PSIZE(hdr); } else { ASSERT3U(HDR_GET_LSIZE(hdr), !=, 0); size = HDR_GET_LSIZE(hdr); } return (size); } static int arc_hdr_authenticate(arc_buf_hdr_t *hdr, spa_t *spa, uint64_t dsobj) { int ret; uint64_t csize; uint64_t lsize = HDR_GET_LSIZE(hdr); uint64_t psize = HDR_GET_PSIZE(hdr); void *tmpbuf = NULL; abd_t *abd = hdr->b_l1hdr.b_pabd; ASSERT(HDR_EMPTY_OR_LOCKED(hdr)); ASSERT(HDR_AUTHENTICATED(hdr)); ASSERT3P(hdr->b_l1hdr.b_pabd, !=, NULL); /* * The MAC is calculated on the compressed data that is stored on disk. * However, if compressed arc is disabled we will only have the * decompressed data available to us now. Compress it into a temporary * abd so we can verify the MAC. The performance overhead of this will * be relatively low, since most objects in an encrypted objset will * be encrypted (instead of authenticated) anyway. */ if (HDR_GET_COMPRESS(hdr) != ZIO_COMPRESS_OFF && !HDR_COMPRESSION_ENABLED(hdr)) { tmpbuf = zio_buf_alloc(lsize); abd = abd_get_from_buf(tmpbuf, lsize); abd_take_ownership_of_buf(abd, B_TRUE); csize = zio_compress_data(HDR_GET_COMPRESS(hdr), hdr->b_l1hdr.b_pabd, tmpbuf, lsize, hdr->b_complevel); ASSERT3U(csize, <=, psize); abd_zero_off(abd, csize, psize - csize); } /* * Authentication is best effort. We authenticate whenever the key is * available. If we succeed we clear ARC_FLAG_NOAUTH. */ if (hdr->b_crypt_hdr.b_ot == DMU_OT_OBJSET) { ASSERT3U(HDR_GET_COMPRESS(hdr), ==, ZIO_COMPRESS_OFF); ASSERT3U(lsize, ==, psize); ret = spa_do_crypt_objset_mac_abd(B_FALSE, spa, dsobj, abd, psize, hdr->b_l1hdr.b_byteswap != DMU_BSWAP_NUMFUNCS); } else { ret = spa_do_crypt_mac_abd(B_FALSE, spa, dsobj, abd, psize, hdr->b_crypt_hdr.b_mac); } if (ret == 0) arc_hdr_clear_flags(hdr, ARC_FLAG_NOAUTH); else if (ret != ENOENT) goto error; if (tmpbuf != NULL) abd_free(abd); return (0); error: if (tmpbuf != NULL) abd_free(abd); return (ret); } /* * This function will take a header that only has raw encrypted data in * b_crypt_hdr.b_rabd and decrypt it into a new buffer which is stored in * b_l1hdr.b_pabd. If designated in the header flags, this function will * also decompress the data. */ static int arc_hdr_decrypt(arc_buf_hdr_t *hdr, spa_t *spa, const zbookmark_phys_t *zb) { int ret; abd_t *cabd = NULL; void *tmp = NULL; boolean_t no_crypt = B_FALSE; boolean_t bswap = (hdr->b_l1hdr.b_byteswap != DMU_BSWAP_NUMFUNCS); ASSERT(HDR_EMPTY_OR_LOCKED(hdr)); ASSERT(HDR_ENCRYPTED(hdr)); arc_hdr_alloc_abd(hdr, ARC_HDR_DO_ADAPT); ret = spa_do_crypt_abd(B_FALSE, spa, zb, hdr->b_crypt_hdr.b_ot, B_FALSE, bswap, hdr->b_crypt_hdr.b_salt, hdr->b_crypt_hdr.b_iv, hdr->b_crypt_hdr.b_mac, HDR_GET_PSIZE(hdr), hdr->b_l1hdr.b_pabd, hdr->b_crypt_hdr.b_rabd, &no_crypt); if (ret != 0) goto error; if (no_crypt) { abd_copy(hdr->b_l1hdr.b_pabd, hdr->b_crypt_hdr.b_rabd, HDR_GET_PSIZE(hdr)); } /* * If this header has disabled arc compression but the b_pabd is * compressed after decrypting it, we need to decompress the newly * decrypted data. */ if (HDR_GET_COMPRESS(hdr) != ZIO_COMPRESS_OFF && !HDR_COMPRESSION_ENABLED(hdr)) { /* * We want to make sure that we are correctly honoring the * zfs_abd_scatter_enabled setting, so we allocate an abd here * and then loan a buffer from it, rather than allocating a * linear buffer and wrapping it in an abd later. */ cabd = arc_get_data_abd(hdr, arc_hdr_size(hdr), hdr, ARC_HDR_DO_ADAPT); tmp = abd_borrow_buf(cabd, arc_hdr_size(hdr)); ret = zio_decompress_data(HDR_GET_COMPRESS(hdr), hdr->b_l1hdr.b_pabd, tmp, HDR_GET_PSIZE(hdr), HDR_GET_LSIZE(hdr), &hdr->b_complevel); if (ret != 0) { abd_return_buf(cabd, tmp, arc_hdr_size(hdr)); goto error; } abd_return_buf_copy(cabd, tmp, arc_hdr_size(hdr)); arc_free_data_abd(hdr, hdr->b_l1hdr.b_pabd, arc_hdr_size(hdr), hdr); hdr->b_l1hdr.b_pabd = cabd; } return (0); error: arc_hdr_free_abd(hdr, B_FALSE); if (cabd != NULL) arc_free_data_buf(hdr, cabd, arc_hdr_size(hdr), hdr); return (ret); } /* * This function is called during arc_buf_fill() to prepare the header's * abd plaintext pointer for use. This involves authenticated protected * data and decrypting encrypted data into the plaintext abd. */ static int arc_fill_hdr_crypt(arc_buf_hdr_t *hdr, kmutex_t *hash_lock, spa_t *spa, const zbookmark_phys_t *zb, boolean_t noauth) { int ret; ASSERT(HDR_PROTECTED(hdr)); if (hash_lock != NULL) mutex_enter(hash_lock); if (HDR_NOAUTH(hdr) && !noauth) { /* * The caller requested authenticated data but our data has * not been authenticated yet. Verify the MAC now if we can. */ ret = arc_hdr_authenticate(hdr, spa, zb->zb_objset); if (ret != 0) goto error; } else if (HDR_HAS_RABD(hdr) && hdr->b_l1hdr.b_pabd == NULL) { /* * If we only have the encrypted version of the data, but the * unencrypted version was requested we take this opportunity * to store the decrypted version in the header for future use. */ ret = arc_hdr_decrypt(hdr, spa, zb); if (ret != 0) goto error; } ASSERT3P(hdr->b_l1hdr.b_pabd, !=, NULL); if (hash_lock != NULL) mutex_exit(hash_lock); return (0); error: if (hash_lock != NULL) mutex_exit(hash_lock); return (ret); } /* * This function is used by the dbuf code to decrypt bonus buffers in place. * The dbuf code itself doesn't have any locking for decrypting a shared dnode * block, so we use the hash lock here to protect against concurrent calls to * arc_buf_fill(). */ static void arc_buf_untransform_in_place(arc_buf_t *buf) { arc_buf_hdr_t *hdr = buf->b_hdr; ASSERT(HDR_ENCRYPTED(hdr)); ASSERT3U(hdr->b_crypt_hdr.b_ot, ==, DMU_OT_DNODE); ASSERT(HDR_EMPTY_OR_LOCKED(hdr)); ASSERT3P(hdr->b_l1hdr.b_pabd, !=, NULL); zio_crypt_copy_dnode_bonus(hdr->b_l1hdr.b_pabd, buf->b_data, arc_buf_size(buf)); buf->b_flags &= ~ARC_BUF_FLAG_ENCRYPTED; buf->b_flags &= ~ARC_BUF_FLAG_COMPRESSED; hdr->b_crypt_hdr.b_ebufcnt -= 1; } /* * Given a buf that has a data buffer attached to it, this function will * efficiently fill the buf with data of the specified compression setting from * the hdr and update the hdr's b_freeze_cksum if necessary. If the buf and hdr * are already sharing a data buf, no copy is performed. * * If the buf is marked as compressed but uncompressed data was requested, this * will allocate a new data buffer for the buf, remove that flag, and fill the * buf with uncompressed data. You can't request a compressed buf on a hdr with * uncompressed data, and (since we haven't added support for it yet) if you * want compressed data your buf must already be marked as compressed and have * the correct-sized data buffer. */ static int arc_buf_fill(arc_buf_t *buf, spa_t *spa, const zbookmark_phys_t *zb, arc_fill_flags_t flags) { int error = 0; arc_buf_hdr_t *hdr = buf->b_hdr; boolean_t hdr_compressed = (arc_hdr_get_compress(hdr) != ZIO_COMPRESS_OFF); boolean_t compressed = (flags & ARC_FILL_COMPRESSED) != 0; boolean_t encrypted = (flags & ARC_FILL_ENCRYPTED) != 0; dmu_object_byteswap_t bswap = hdr->b_l1hdr.b_byteswap; kmutex_t *hash_lock = (flags & ARC_FILL_LOCKED) ? NULL : HDR_LOCK(hdr); ASSERT3P(buf->b_data, !=, NULL); IMPLY(compressed, hdr_compressed || ARC_BUF_ENCRYPTED(buf)); IMPLY(compressed, ARC_BUF_COMPRESSED(buf)); IMPLY(encrypted, HDR_ENCRYPTED(hdr)); IMPLY(encrypted, ARC_BUF_ENCRYPTED(buf)); IMPLY(encrypted, ARC_BUF_COMPRESSED(buf)); IMPLY(encrypted, !ARC_BUF_SHARED(buf)); /* * If the caller wanted encrypted data we just need to copy it from * b_rabd and potentially byteswap it. We won't be able to do any * further transforms on it. */ if (encrypted) { ASSERT(HDR_HAS_RABD(hdr)); abd_copy_to_buf(buf->b_data, hdr->b_crypt_hdr.b_rabd, HDR_GET_PSIZE(hdr)); goto byteswap; } /* * Adjust encrypted and authenticated headers to accommodate * the request if needed. Dnode blocks (ARC_FILL_IN_PLACE) are * allowed to fail decryption due to keys not being loaded * without being marked as an IO error. */ if (HDR_PROTECTED(hdr)) { error = arc_fill_hdr_crypt(hdr, hash_lock, spa, zb, !!(flags & ARC_FILL_NOAUTH)); if (error == EACCES && (flags & ARC_FILL_IN_PLACE) != 0) { return (error); } else if (error != 0) { if (hash_lock != NULL) mutex_enter(hash_lock); arc_hdr_set_flags(hdr, ARC_FLAG_IO_ERROR); if (hash_lock != NULL) mutex_exit(hash_lock); return (error); } } /* * There is a special case here for dnode blocks which are * decrypting their bonus buffers. These blocks may request to * be decrypted in-place. This is necessary because there may * be many dnodes pointing into this buffer and there is * currently no method to synchronize replacing the backing * b_data buffer and updating all of the pointers. Here we use * the hash lock to ensure there are no races. If the need * arises for other types to be decrypted in-place, they must * add handling here as well. */ if ((flags & ARC_FILL_IN_PLACE) != 0) { ASSERT(!hdr_compressed); ASSERT(!compressed); ASSERT(!encrypted); if (HDR_ENCRYPTED(hdr) && ARC_BUF_ENCRYPTED(buf)) { ASSERT3U(hdr->b_crypt_hdr.b_ot, ==, DMU_OT_DNODE); if (hash_lock != NULL) mutex_enter(hash_lock); arc_buf_untransform_in_place(buf); if (hash_lock != NULL) mutex_exit(hash_lock); /* Compute the hdr's checksum if necessary */ arc_cksum_compute(buf); } return (0); } if (hdr_compressed == compressed) { if (!arc_buf_is_shared(buf)) { abd_copy_to_buf(buf->b_data, hdr->b_l1hdr.b_pabd, arc_buf_size(buf)); } } else { ASSERT(hdr_compressed); ASSERT(!compressed); /* * If the buf is sharing its data with the hdr, unlink it and * allocate a new data buffer for the buf. */ if (arc_buf_is_shared(buf)) { ASSERT(ARC_BUF_COMPRESSED(buf)); /* We need to give the buf its own b_data */ buf->b_flags &= ~ARC_BUF_FLAG_SHARED; buf->b_data = arc_get_data_buf(hdr, HDR_GET_LSIZE(hdr), buf); arc_hdr_clear_flags(hdr, ARC_FLAG_SHARED_DATA); /* Previously overhead was 0; just add new overhead */ ARCSTAT_INCR(arcstat_overhead_size, HDR_GET_LSIZE(hdr)); } else if (ARC_BUF_COMPRESSED(buf)) { /* We need to reallocate the buf's b_data */ arc_free_data_buf(hdr, buf->b_data, HDR_GET_PSIZE(hdr), buf); buf->b_data = arc_get_data_buf(hdr, HDR_GET_LSIZE(hdr), buf); /* We increased the size of b_data; update overhead */ ARCSTAT_INCR(arcstat_overhead_size, HDR_GET_LSIZE(hdr) - HDR_GET_PSIZE(hdr)); } /* * Regardless of the buf's previous compression settings, it * should not be compressed at the end of this function. */ buf->b_flags &= ~ARC_BUF_FLAG_COMPRESSED; /* * Try copying the data from another buf which already has a * decompressed version. If that's not possible, it's time to * bite the bullet and decompress the data from the hdr. */ if (arc_buf_try_copy_decompressed_data(buf)) { /* Skip byteswapping and checksumming (already done) */ return (0); } else { error = zio_decompress_data(HDR_GET_COMPRESS(hdr), hdr->b_l1hdr.b_pabd, buf->b_data, HDR_GET_PSIZE(hdr), HDR_GET_LSIZE(hdr), &hdr->b_complevel); /* * Absent hardware errors or software bugs, this should * be impossible, but log it anyway so we can debug it. */ if (error != 0) { zfs_dbgmsg( "hdr %px, compress %d, psize %d, lsize %d", hdr, arc_hdr_get_compress(hdr), HDR_GET_PSIZE(hdr), HDR_GET_LSIZE(hdr)); if (hash_lock != NULL) mutex_enter(hash_lock); arc_hdr_set_flags(hdr, ARC_FLAG_IO_ERROR); if (hash_lock != NULL) mutex_exit(hash_lock); return (SET_ERROR(EIO)); } } } byteswap: /* Byteswap the buf's data if necessary */ if (bswap != DMU_BSWAP_NUMFUNCS) { ASSERT(!HDR_SHARED_DATA(hdr)); ASSERT3U(bswap, <, DMU_BSWAP_NUMFUNCS); dmu_ot_byteswap[bswap].ob_func(buf->b_data, HDR_GET_LSIZE(hdr)); } /* Compute the hdr's checksum if necessary */ arc_cksum_compute(buf); return (0); } /* * If this function is being called to decrypt an encrypted buffer or verify an * authenticated one, the key must be loaded and a mapping must be made * available in the keystore via spa_keystore_create_mapping() or one of its * callers. */ int arc_untransform(arc_buf_t *buf, spa_t *spa, const zbookmark_phys_t *zb, boolean_t in_place) { int ret; arc_fill_flags_t flags = 0; if (in_place) flags |= ARC_FILL_IN_PLACE; ret = arc_buf_fill(buf, spa, zb, flags); if (ret == ECKSUM) { /* * Convert authentication and decryption errors to EIO * (and generate an ereport) before leaving the ARC. */ ret = SET_ERROR(EIO); spa_log_error(spa, zb); (void) zfs_ereport_post(FM_EREPORT_ZFS_AUTHENTICATION, spa, NULL, zb, NULL, 0); } return (ret); } /* * Increment the amount of evictable space in the arc_state_t's refcount. * We account for the space used by the hdr and the arc buf individually * so that we can add and remove them from the refcount individually. */ static void arc_evictable_space_increment(arc_buf_hdr_t *hdr, arc_state_t *state) { arc_buf_contents_t type = arc_buf_type(hdr); ASSERT(HDR_HAS_L1HDR(hdr)); if (GHOST_STATE(state)) { ASSERT0(hdr->b_l1hdr.b_bufcnt); ASSERT3P(hdr->b_l1hdr.b_buf, ==, NULL); ASSERT3P(hdr->b_l1hdr.b_pabd, ==, NULL); ASSERT(!HDR_HAS_RABD(hdr)); (void) zfs_refcount_add_many(&state->arcs_esize[type], HDR_GET_LSIZE(hdr), hdr); return; } if (hdr->b_l1hdr.b_pabd != NULL) { (void) zfs_refcount_add_many(&state->arcs_esize[type], arc_hdr_size(hdr), hdr); } if (HDR_HAS_RABD(hdr)) { (void) zfs_refcount_add_many(&state->arcs_esize[type], HDR_GET_PSIZE(hdr), hdr); } for (arc_buf_t *buf = hdr->b_l1hdr.b_buf; buf != NULL; buf = buf->b_next) { if (arc_buf_is_shared(buf)) continue; (void) zfs_refcount_add_many(&state->arcs_esize[type], arc_buf_size(buf), buf); } } /* * Decrement the amount of evictable space in the arc_state_t's refcount. * We account for the space used by the hdr and the arc buf individually * so that we can add and remove them from the refcount individually. */ static void arc_evictable_space_decrement(arc_buf_hdr_t *hdr, arc_state_t *state) { arc_buf_contents_t type = arc_buf_type(hdr); ASSERT(HDR_HAS_L1HDR(hdr)); if (GHOST_STATE(state)) { ASSERT0(hdr->b_l1hdr.b_bufcnt); ASSERT3P(hdr->b_l1hdr.b_buf, ==, NULL); ASSERT3P(hdr->b_l1hdr.b_pabd, ==, NULL); ASSERT(!HDR_HAS_RABD(hdr)); (void) zfs_refcount_remove_many(&state->arcs_esize[type], HDR_GET_LSIZE(hdr), hdr); return; } if (hdr->b_l1hdr.b_pabd != NULL) { (void) zfs_refcount_remove_many(&state->arcs_esize[type], arc_hdr_size(hdr), hdr); } if (HDR_HAS_RABD(hdr)) { (void) zfs_refcount_remove_many(&state->arcs_esize[type], HDR_GET_PSIZE(hdr), hdr); } for (arc_buf_t *buf = hdr->b_l1hdr.b_buf; buf != NULL; buf = buf->b_next) { if (arc_buf_is_shared(buf)) continue; (void) zfs_refcount_remove_many(&state->arcs_esize[type], arc_buf_size(buf), buf); } } /* * Add a reference to this hdr indicating that someone is actively * referencing that memory. When the refcount transitions from 0 to 1, * we remove it from the respective arc_state_t list to indicate that * it is not evictable. */ static void add_reference(arc_buf_hdr_t *hdr, const void *tag) { arc_state_t *state; ASSERT(HDR_HAS_L1HDR(hdr)); if (!HDR_EMPTY(hdr) && !MUTEX_HELD(HDR_LOCK(hdr))) { ASSERT(hdr->b_l1hdr.b_state == arc_anon); ASSERT(zfs_refcount_is_zero(&hdr->b_l1hdr.b_refcnt)); ASSERT3P(hdr->b_l1hdr.b_buf, ==, NULL); } state = hdr->b_l1hdr.b_state; if ((zfs_refcount_add(&hdr->b_l1hdr.b_refcnt, tag) == 1) && (state != arc_anon)) { /* We don't use the L2-only state list. */ if (state != arc_l2c_only) { multilist_remove(&state->arcs_list[arc_buf_type(hdr)], hdr); arc_evictable_space_decrement(hdr, state); } /* remove the prefetch flag if we get a reference */ if (HDR_HAS_L2HDR(hdr)) l2arc_hdr_arcstats_decrement_state(hdr); arc_hdr_clear_flags(hdr, ARC_FLAG_PREFETCH); if (HDR_HAS_L2HDR(hdr)) l2arc_hdr_arcstats_increment_state(hdr); } } /* * Remove a reference from this hdr. When the reference transitions from * 1 to 0 and we're not anonymous, then we add this hdr to the arc_state_t's * list making it eligible for eviction. */ static int remove_reference(arc_buf_hdr_t *hdr, kmutex_t *hash_lock, const void *tag) { int cnt; arc_state_t *state = hdr->b_l1hdr.b_state; ASSERT(HDR_HAS_L1HDR(hdr)); ASSERT(state == arc_anon || MUTEX_HELD(hash_lock)); ASSERT(!GHOST_STATE(state)); /* * arc_l2c_only counts as a ghost state so we don't need to explicitly * check to prevent usage of the arc_l2c_only list. */ if (((cnt = zfs_refcount_remove(&hdr->b_l1hdr.b_refcnt, tag)) == 0) && (state != arc_anon)) { multilist_insert(&state->arcs_list[arc_buf_type(hdr)], hdr); ASSERT3U(hdr->b_l1hdr.b_bufcnt, >, 0); arc_evictable_space_increment(hdr, state); } return (cnt); } /* * Returns detailed information about a specific arc buffer. When the * state_index argument is set the function will calculate the arc header * list position for its arc state. Since this requires a linear traversal * callers are strongly encourage not to do this. However, it can be helpful * for targeted analysis so the functionality is provided. */ void arc_buf_info(arc_buf_t *ab, arc_buf_info_t *abi, int state_index) { (void) state_index; arc_buf_hdr_t *hdr = ab->b_hdr; l1arc_buf_hdr_t *l1hdr = NULL; l2arc_buf_hdr_t *l2hdr = NULL; arc_state_t *state = NULL; memset(abi, 0, sizeof (arc_buf_info_t)); if (hdr == NULL) return; abi->abi_flags = hdr->b_flags; if (HDR_HAS_L1HDR(hdr)) { l1hdr = &hdr->b_l1hdr; state = l1hdr->b_state; } if (HDR_HAS_L2HDR(hdr)) l2hdr = &hdr->b_l2hdr; if (l1hdr) { abi->abi_bufcnt = l1hdr->b_bufcnt; abi->abi_access = l1hdr->b_arc_access; abi->abi_mru_hits = l1hdr->b_mru_hits; abi->abi_mru_ghost_hits = l1hdr->b_mru_ghost_hits; abi->abi_mfu_hits = l1hdr->b_mfu_hits; abi->abi_mfu_ghost_hits = l1hdr->b_mfu_ghost_hits; abi->abi_holds = zfs_refcount_count(&l1hdr->b_refcnt); } if (l2hdr) { abi->abi_l2arc_dattr = l2hdr->b_daddr; abi->abi_l2arc_hits = l2hdr->b_hits; } abi->abi_state_type = state ? state->arcs_state : ARC_STATE_ANON; abi->abi_state_contents = arc_buf_type(hdr); abi->abi_size = arc_hdr_size(hdr); } /* * Move the supplied buffer to the indicated state. The hash lock * for the buffer must be held by the caller. */ static void arc_change_state(arc_state_t *new_state, arc_buf_hdr_t *hdr, kmutex_t *hash_lock) { arc_state_t *old_state; int64_t refcnt; uint32_t bufcnt; boolean_t update_old, update_new; arc_buf_contents_t buftype = arc_buf_type(hdr); /* * We almost always have an L1 hdr here, since we call arc_hdr_realloc() * in arc_read() when bringing a buffer out of the L2ARC. However, the * L1 hdr doesn't always exist when we change state to arc_anon before * destroying a header, in which case reallocating to add the L1 hdr is * pointless. */ if (HDR_HAS_L1HDR(hdr)) { old_state = hdr->b_l1hdr.b_state; refcnt = zfs_refcount_count(&hdr->b_l1hdr.b_refcnt); bufcnt = hdr->b_l1hdr.b_bufcnt; update_old = (bufcnt > 0 || hdr->b_l1hdr.b_pabd != NULL || HDR_HAS_RABD(hdr)); } else { old_state = arc_l2c_only; refcnt = 0; bufcnt = 0; update_old = B_FALSE; } update_new = update_old; ASSERT(MUTEX_HELD(hash_lock)); ASSERT3P(new_state, !=, old_state); ASSERT(!GHOST_STATE(new_state) || bufcnt == 0); ASSERT(old_state != arc_anon || bufcnt <= 1); /* * If this buffer is evictable, transfer it from the * old state list to the new state list. */ if (refcnt == 0) { if (old_state != arc_anon && old_state != arc_l2c_only) { ASSERT(HDR_HAS_L1HDR(hdr)); multilist_remove(&old_state->arcs_list[buftype], hdr); if (GHOST_STATE(old_state)) { ASSERT0(bufcnt); ASSERT3P(hdr->b_l1hdr.b_buf, ==, NULL); update_old = B_TRUE; } arc_evictable_space_decrement(hdr, old_state); } if (new_state != arc_anon && new_state != arc_l2c_only) { /* * An L1 header always exists here, since if we're * moving to some L1-cached state (i.e. not l2c_only or * anonymous), we realloc the header to add an L1hdr * beforehand. */ ASSERT(HDR_HAS_L1HDR(hdr)); multilist_insert(&new_state->arcs_list[buftype], hdr); if (GHOST_STATE(new_state)) { ASSERT0(bufcnt); ASSERT3P(hdr->b_l1hdr.b_buf, ==, NULL); update_new = B_TRUE; } arc_evictable_space_increment(hdr, new_state); } } ASSERT(!HDR_EMPTY(hdr)); if (new_state == arc_anon && HDR_IN_HASH_TABLE(hdr)) buf_hash_remove(hdr); /* adjust state sizes (ignore arc_l2c_only) */ if (update_new && new_state != arc_l2c_only) { ASSERT(HDR_HAS_L1HDR(hdr)); if (GHOST_STATE(new_state)) { ASSERT0(bufcnt); /* * When moving a header to a ghost state, we first * remove all arc buffers. Thus, we'll have a * bufcnt of zero, and no arc buffer to use for * the reference. As a result, we use the arc * header pointer for the reference. */ (void) zfs_refcount_add_many(&new_state->arcs_size, HDR_GET_LSIZE(hdr), hdr); ASSERT3P(hdr->b_l1hdr.b_pabd, ==, NULL); ASSERT(!HDR_HAS_RABD(hdr)); } else { uint32_t buffers = 0; /* * Each individual buffer holds a unique reference, * thus we must remove each of these references one * at a time. */ for (arc_buf_t *buf = hdr->b_l1hdr.b_buf; buf != NULL; buf = buf->b_next) { ASSERT3U(bufcnt, !=, 0); buffers++; /* * When the arc_buf_t is sharing the data * block with the hdr, the owner of the * reference belongs to the hdr. Only * add to the refcount if the arc_buf_t is * not shared. */ if (arc_buf_is_shared(buf)) continue; (void) zfs_refcount_add_many( &new_state->arcs_size, arc_buf_size(buf), buf); } ASSERT3U(bufcnt, ==, buffers); if (hdr->b_l1hdr.b_pabd != NULL) { (void) zfs_refcount_add_many( &new_state->arcs_size, arc_hdr_size(hdr), hdr); } if (HDR_HAS_RABD(hdr)) { (void) zfs_refcount_add_many( &new_state->arcs_size, HDR_GET_PSIZE(hdr), hdr); } } } if (update_old && old_state != arc_l2c_only) { ASSERT(HDR_HAS_L1HDR(hdr)); if (GHOST_STATE(old_state)) { ASSERT0(bufcnt); ASSERT3P(hdr->b_l1hdr.b_pabd, ==, NULL); ASSERT(!HDR_HAS_RABD(hdr)); /* * When moving a header off of a ghost state, * the header will not contain any arc buffers. * We use the arc header pointer for the reference * which is exactly what we did when we put the * header on the ghost state. */ (void) zfs_refcount_remove_many(&old_state->arcs_size, HDR_GET_LSIZE(hdr), hdr); } else { uint32_t buffers = 0; /* * Each individual buffer holds a unique reference, * thus we must remove each of these references one * at a time. */ for (arc_buf_t *buf = hdr->b_l1hdr.b_buf; buf != NULL; buf = buf->b_next) { ASSERT3U(bufcnt, !=, 0); buffers++; /* * When the arc_buf_t is sharing the data * block with the hdr, the owner of the * reference belongs to the hdr. Only * add to the refcount if the arc_buf_t is * not shared. */ if (arc_buf_is_shared(buf)) continue; (void) zfs_refcount_remove_many( &old_state->arcs_size, arc_buf_size(buf), buf); } ASSERT3U(bufcnt, ==, buffers); ASSERT(hdr->b_l1hdr.b_pabd != NULL || HDR_HAS_RABD(hdr)); if (hdr->b_l1hdr.b_pabd != NULL) { (void) zfs_refcount_remove_many( &old_state->arcs_size, arc_hdr_size(hdr), hdr); } if (HDR_HAS_RABD(hdr)) { (void) zfs_refcount_remove_many( &old_state->arcs_size, HDR_GET_PSIZE(hdr), hdr); } } } if (HDR_HAS_L1HDR(hdr)) { hdr->b_l1hdr.b_state = new_state; if (HDR_HAS_L2HDR(hdr) && new_state != arc_l2c_only) { l2arc_hdr_arcstats_decrement_state(hdr); hdr->b_l2hdr.b_arcs_state = new_state->arcs_state; l2arc_hdr_arcstats_increment_state(hdr); } } } void arc_space_consume(uint64_t space, arc_space_type_t type) { ASSERT(type >= 0 && type < ARC_SPACE_NUMTYPES); switch (type) { default: break; case ARC_SPACE_DATA: ARCSTAT_INCR(arcstat_data_size, space); break; case ARC_SPACE_META: ARCSTAT_INCR(arcstat_metadata_size, space); break; case ARC_SPACE_BONUS: ARCSTAT_INCR(arcstat_bonus_size, space); break; case ARC_SPACE_DNODE: aggsum_add(&arc_sums.arcstat_dnode_size, space); break; case ARC_SPACE_DBUF: ARCSTAT_INCR(arcstat_dbuf_size, space); break; case ARC_SPACE_HDRS: ARCSTAT_INCR(arcstat_hdr_size, space); break; case ARC_SPACE_L2HDRS: aggsum_add(&arc_sums.arcstat_l2_hdr_size, space); break; case ARC_SPACE_ABD_CHUNK_WASTE: /* * Note: this includes space wasted by all scatter ABD's, not * just those allocated by the ARC. But the vast majority of * scatter ABD's come from the ARC, because other users are * very short-lived. */ ARCSTAT_INCR(arcstat_abd_chunk_waste_size, space); break; } if (type != ARC_SPACE_DATA && type != ARC_SPACE_ABD_CHUNK_WASTE) aggsum_add(&arc_sums.arcstat_meta_used, space); aggsum_add(&arc_sums.arcstat_size, space); } void arc_space_return(uint64_t space, arc_space_type_t type) { ASSERT(type >= 0 && type < ARC_SPACE_NUMTYPES); switch (type) { default: break; case ARC_SPACE_DATA: ARCSTAT_INCR(arcstat_data_size, -space); break; case ARC_SPACE_META: ARCSTAT_INCR(arcstat_metadata_size, -space); break; case ARC_SPACE_BONUS: ARCSTAT_INCR(arcstat_bonus_size, -space); break; case ARC_SPACE_DNODE: aggsum_add(&arc_sums.arcstat_dnode_size, -space); break; case ARC_SPACE_DBUF: ARCSTAT_INCR(arcstat_dbuf_size, -space); break; case ARC_SPACE_HDRS: ARCSTAT_INCR(arcstat_hdr_size, -space); break; case ARC_SPACE_L2HDRS: aggsum_add(&arc_sums.arcstat_l2_hdr_size, -space); break; case ARC_SPACE_ABD_CHUNK_WASTE: ARCSTAT_INCR(arcstat_abd_chunk_waste_size, -space); break; } if (type != ARC_SPACE_DATA && type != ARC_SPACE_ABD_CHUNK_WASTE) { ASSERT(aggsum_compare(&arc_sums.arcstat_meta_used, space) >= 0); ARCSTAT_MAX(arcstat_meta_max, aggsum_upper_bound(&arc_sums.arcstat_meta_used)); aggsum_add(&arc_sums.arcstat_meta_used, -space); } ASSERT(aggsum_compare(&arc_sums.arcstat_size, space) >= 0); aggsum_add(&arc_sums.arcstat_size, -space); } /* * Given a hdr and a buf, returns whether that buf can share its b_data buffer * with the hdr's b_pabd. */ static boolean_t arc_can_share(arc_buf_hdr_t *hdr, arc_buf_t *buf) { /* * The criteria for sharing a hdr's data are: * 1. the buffer is not encrypted * 2. the hdr's compression matches the buf's compression * 3. the hdr doesn't need to be byteswapped * 4. the hdr isn't already being shared * 5. the buf is either compressed or it is the last buf in the hdr list * * Criterion #5 maintains the invariant that shared uncompressed * bufs must be the final buf in the hdr's b_buf list. Reading this, you * might ask, "if a compressed buf is allocated first, won't that be the * last thing in the list?", but in that case it's impossible to create * a shared uncompressed buf anyway (because the hdr must be compressed * to have the compressed buf). You might also think that #3 is * sufficient to make this guarantee, however it's possible * (specifically in the rare L2ARC write race mentioned in * arc_buf_alloc_impl()) there will be an existing uncompressed buf that * is shareable, but wasn't at the time of its allocation. Rather than * allow a new shared uncompressed buf to be created and then shuffle * the list around to make it the last element, this simply disallows * sharing if the new buf isn't the first to be added. */ ASSERT3P(buf->b_hdr, ==, hdr); boolean_t hdr_compressed = arc_hdr_get_compress(hdr) != ZIO_COMPRESS_OFF; boolean_t buf_compressed = ARC_BUF_COMPRESSED(buf) != 0; return (!ARC_BUF_ENCRYPTED(buf) && buf_compressed == hdr_compressed && hdr->b_l1hdr.b_byteswap == DMU_BSWAP_NUMFUNCS && !HDR_SHARED_DATA(hdr) && (ARC_BUF_LAST(buf) || ARC_BUF_COMPRESSED(buf))); } /* * Allocate a buf for this hdr. If you care about the data that's in the hdr, * or if you want a compressed buffer, pass those flags in. Returns 0 if the * copy was made successfully, or an error code otherwise. */ static int arc_buf_alloc_impl(arc_buf_hdr_t *hdr, spa_t *spa, const zbookmark_phys_t *zb, const void *tag, boolean_t encrypted, boolean_t compressed, boolean_t noauth, boolean_t fill, arc_buf_t **ret) { arc_buf_t *buf; arc_fill_flags_t flags = ARC_FILL_LOCKED; ASSERT(HDR_HAS_L1HDR(hdr)); ASSERT3U(HDR_GET_LSIZE(hdr), >, 0); VERIFY(hdr->b_type == ARC_BUFC_DATA || hdr->b_type == ARC_BUFC_METADATA); ASSERT3P(ret, !=, NULL); ASSERT3P(*ret, ==, NULL); IMPLY(encrypted, compressed); buf = *ret = kmem_cache_alloc(buf_cache, KM_PUSHPAGE); buf->b_hdr = hdr; buf->b_data = NULL; buf->b_next = hdr->b_l1hdr.b_buf; buf->b_flags = 0; add_reference(hdr, tag); /* * We're about to change the hdr's b_flags. We must either * hold the hash_lock or be undiscoverable. */ ASSERT(HDR_EMPTY_OR_LOCKED(hdr)); /* * Only honor requests for compressed bufs if the hdr is actually * compressed. This must be overridden if the buffer is encrypted since * encrypted buffers cannot be decompressed. */ if (encrypted) { buf->b_flags |= ARC_BUF_FLAG_COMPRESSED; buf->b_flags |= ARC_BUF_FLAG_ENCRYPTED; flags |= ARC_FILL_COMPRESSED | ARC_FILL_ENCRYPTED; } else if (compressed && arc_hdr_get_compress(hdr) != ZIO_COMPRESS_OFF) { buf->b_flags |= ARC_BUF_FLAG_COMPRESSED; flags |= ARC_FILL_COMPRESSED; } if (noauth) { ASSERT0(encrypted); flags |= ARC_FILL_NOAUTH; } /* * If the hdr's data can be shared then we share the data buffer and * set the appropriate bit in the hdr's b_flags to indicate the hdr is * sharing it's b_pabd with the arc_buf_t. Otherwise, we allocate a new * buffer to store the buf's data. * * There are two additional restrictions here because we're sharing * hdr -> buf instead of the usual buf -> hdr. First, the hdr can't be * actively involved in an L2ARC write, because if this buf is used by * an arc_write() then the hdr's data buffer will be released when the * write completes, even though the L2ARC write might still be using it. * Second, the hdr's ABD must be linear so that the buf's user doesn't * need to be ABD-aware. It must be allocated via * zio_[data_]buf_alloc(), not as a page, because we need to be able * to abd_release_ownership_of_buf(), which isn't allowed on "linear * page" buffers because the ABD code needs to handle freeing them * specially. */ boolean_t can_share = arc_can_share(hdr, buf) && !HDR_L2_WRITING(hdr) && hdr->b_l1hdr.b_pabd != NULL && abd_is_linear(hdr->b_l1hdr.b_pabd) && !abd_is_linear_page(hdr->b_l1hdr.b_pabd); /* Set up b_data and sharing */ if (can_share) { buf->b_data = abd_to_buf(hdr->b_l1hdr.b_pabd); buf->b_flags |= ARC_BUF_FLAG_SHARED; arc_hdr_set_flags(hdr, ARC_FLAG_SHARED_DATA); } else { buf->b_data = arc_get_data_buf(hdr, arc_buf_size(buf), buf); ARCSTAT_INCR(arcstat_overhead_size, arc_buf_size(buf)); } VERIFY3P(buf->b_data, !=, NULL); hdr->b_l1hdr.b_buf = buf; hdr->b_l1hdr.b_bufcnt += 1; if (encrypted) hdr->b_crypt_hdr.b_ebufcnt += 1; /* * If the user wants the data from the hdr, we need to either copy or * decompress the data. */ if (fill) { ASSERT3P(zb, !=, NULL); return (arc_buf_fill(buf, spa, zb, flags)); } return (0); } static const char *arc_onloan_tag = "onloan"; static inline void arc_loaned_bytes_update(int64_t delta) { atomic_add_64(&arc_loaned_bytes, delta); /* assert that it did not wrap around */ ASSERT3S(atomic_add_64_nv(&arc_loaned_bytes, 0), >=, 0); } /* * Loan out an anonymous arc buffer. Loaned buffers are not counted as in * flight data by arc_tempreserve_space() until they are "returned". Loaned * buffers must be returned to the arc before they can be used by the DMU or * freed. */ arc_buf_t * arc_loan_buf(spa_t *spa, boolean_t is_metadata, int size) { arc_buf_t *buf = arc_alloc_buf(spa, arc_onloan_tag, is_metadata ? ARC_BUFC_METADATA : ARC_BUFC_DATA, size); arc_loaned_bytes_update(arc_buf_size(buf)); return (buf); } arc_buf_t * arc_loan_compressed_buf(spa_t *spa, uint64_t psize, uint64_t lsize, enum zio_compress compression_type, uint8_t complevel) { arc_buf_t *buf = arc_alloc_compressed_buf(spa, arc_onloan_tag, psize, lsize, compression_type, complevel); arc_loaned_bytes_update(arc_buf_size(buf)); return (buf); } arc_buf_t * arc_loan_raw_buf(spa_t *spa, uint64_t dsobj, boolean_t byteorder, const uint8_t *salt, const uint8_t *iv, const uint8_t *mac, dmu_object_type_t ot, uint64_t psize, uint64_t lsize, enum zio_compress compression_type, uint8_t complevel) { arc_buf_t *buf = arc_alloc_raw_buf(spa, arc_onloan_tag, dsobj, byteorder, salt, iv, mac, ot, psize, lsize, compression_type, complevel); atomic_add_64(&arc_loaned_bytes, psize); return (buf); } /* * Return a loaned arc buffer to the arc. */ void arc_return_buf(arc_buf_t *buf, const void *tag) { arc_buf_hdr_t *hdr = buf->b_hdr; ASSERT3P(buf->b_data, !=, NULL); ASSERT(HDR_HAS_L1HDR(hdr)); (void) zfs_refcount_add(&hdr->b_l1hdr.b_refcnt, tag); (void) zfs_refcount_remove(&hdr->b_l1hdr.b_refcnt, arc_onloan_tag); arc_loaned_bytes_update(-arc_buf_size(buf)); } /* Detach an arc_buf from a dbuf (tag) */ void arc_loan_inuse_buf(arc_buf_t *buf, const void *tag) { arc_buf_hdr_t *hdr = buf->b_hdr; ASSERT3P(buf->b_data, !=, NULL); ASSERT(HDR_HAS_L1HDR(hdr)); (void) zfs_refcount_add(&hdr->b_l1hdr.b_refcnt, arc_onloan_tag); (void) zfs_refcount_remove(&hdr->b_l1hdr.b_refcnt, tag); arc_loaned_bytes_update(arc_buf_size(buf)); } static void l2arc_free_abd_on_write(abd_t *abd, size_t size, arc_buf_contents_t type) { l2arc_data_free_t *df = kmem_alloc(sizeof (*df), KM_SLEEP); df->l2df_abd = abd; df->l2df_size = size; df->l2df_type = type; mutex_enter(&l2arc_free_on_write_mtx); list_insert_head(l2arc_free_on_write, df); mutex_exit(&l2arc_free_on_write_mtx); } static void arc_hdr_free_on_write(arc_buf_hdr_t *hdr, boolean_t free_rdata) { arc_state_t *state = hdr->b_l1hdr.b_state; arc_buf_contents_t type = arc_buf_type(hdr); uint64_t size = (free_rdata) ? HDR_GET_PSIZE(hdr) : arc_hdr_size(hdr); /* protected by hash lock, if in the hash table */ if (multilist_link_active(&hdr->b_l1hdr.b_arc_node)) { ASSERT(zfs_refcount_is_zero(&hdr->b_l1hdr.b_refcnt)); ASSERT(state != arc_anon && state != arc_l2c_only); (void) zfs_refcount_remove_many(&state->arcs_esize[type], size, hdr); } (void) zfs_refcount_remove_many(&state->arcs_size, size, hdr); if (type == ARC_BUFC_METADATA) { arc_space_return(size, ARC_SPACE_META); } else { ASSERT(type == ARC_BUFC_DATA); arc_space_return(size, ARC_SPACE_DATA); } if (free_rdata) { l2arc_free_abd_on_write(hdr->b_crypt_hdr.b_rabd, size, type); } else { l2arc_free_abd_on_write(hdr->b_l1hdr.b_pabd, size, type); } } /* * Share the arc_buf_t's data with the hdr. Whenever we are sharing the * data buffer, we transfer the refcount ownership to the hdr and update * the appropriate kstats. */ static void arc_share_buf(arc_buf_hdr_t *hdr, arc_buf_t *buf) { ASSERT(arc_can_share(hdr, buf)); ASSERT3P(hdr->b_l1hdr.b_pabd, ==, NULL); ASSERT(!ARC_BUF_ENCRYPTED(buf)); ASSERT(HDR_EMPTY_OR_LOCKED(hdr)); /* * Start sharing the data buffer. We transfer the * refcount ownership to the hdr since it always owns * the refcount whenever an arc_buf_t is shared. */ zfs_refcount_transfer_ownership_many(&hdr->b_l1hdr.b_state->arcs_size, arc_hdr_size(hdr), buf, hdr); hdr->b_l1hdr.b_pabd = abd_get_from_buf(buf->b_data, arc_buf_size(buf)); abd_take_ownership_of_buf(hdr->b_l1hdr.b_pabd, HDR_ISTYPE_METADATA(hdr)); arc_hdr_set_flags(hdr, ARC_FLAG_SHARED_DATA); buf->b_flags |= ARC_BUF_FLAG_SHARED; /* * Since we've transferred ownership to the hdr we need * to increment its compressed and uncompressed kstats and * decrement the overhead size. */ ARCSTAT_INCR(arcstat_compressed_size, arc_hdr_size(hdr)); ARCSTAT_INCR(arcstat_uncompressed_size, HDR_GET_LSIZE(hdr)); ARCSTAT_INCR(arcstat_overhead_size, -arc_buf_size(buf)); } static void arc_unshare_buf(arc_buf_hdr_t *hdr, arc_buf_t *buf) { ASSERT(arc_buf_is_shared(buf)); ASSERT3P(hdr->b_l1hdr.b_pabd, !=, NULL); ASSERT(HDR_EMPTY_OR_LOCKED(hdr)); /* * We are no longer sharing this buffer so we need * to transfer its ownership to the rightful owner. */ zfs_refcount_transfer_ownership_many(&hdr->b_l1hdr.b_state->arcs_size, arc_hdr_size(hdr), hdr, buf); arc_hdr_clear_flags(hdr, ARC_FLAG_SHARED_DATA); abd_release_ownership_of_buf(hdr->b_l1hdr.b_pabd); abd_free(hdr->b_l1hdr.b_pabd); hdr->b_l1hdr.b_pabd = NULL; buf->b_flags &= ~ARC_BUF_FLAG_SHARED; /* * Since the buffer is no longer shared between * the arc buf and the hdr, count it as overhead. */ ARCSTAT_INCR(arcstat_compressed_size, -arc_hdr_size(hdr)); ARCSTAT_INCR(arcstat_uncompressed_size, -HDR_GET_LSIZE(hdr)); ARCSTAT_INCR(arcstat_overhead_size, arc_buf_size(buf)); } /* * Remove an arc_buf_t from the hdr's buf list and return the last * arc_buf_t on the list. If no buffers remain on the list then return * NULL. */ static arc_buf_t * arc_buf_remove(arc_buf_hdr_t *hdr, arc_buf_t *buf) { ASSERT(HDR_HAS_L1HDR(hdr)); ASSERT(HDR_EMPTY_OR_LOCKED(hdr)); arc_buf_t **bufp = &hdr->b_l1hdr.b_buf; arc_buf_t *lastbuf = NULL; /* * Remove the buf from the hdr list and locate the last * remaining buffer on the list. */ while (*bufp != NULL) { if (*bufp == buf) *bufp = buf->b_next; /* * If we've removed a buffer in the middle of * the list then update the lastbuf and update * bufp. */ if (*bufp != NULL) { lastbuf = *bufp; bufp = &(*bufp)->b_next; } } buf->b_next = NULL; ASSERT3P(lastbuf, !=, buf); IMPLY(hdr->b_l1hdr.b_bufcnt > 0, lastbuf != NULL); IMPLY(hdr->b_l1hdr.b_bufcnt > 0, hdr->b_l1hdr.b_buf != NULL); IMPLY(lastbuf != NULL, ARC_BUF_LAST(lastbuf)); return (lastbuf); } /* * Free up buf->b_data and pull the arc_buf_t off of the arc_buf_hdr_t's * list and free it. */ static void arc_buf_destroy_impl(arc_buf_t *buf) { arc_buf_hdr_t *hdr = buf->b_hdr; /* * Free up the data associated with the buf but only if we're not * sharing this with the hdr. If we are sharing it with the hdr, the * hdr is responsible for doing the free. */ if (buf->b_data != NULL) { /* * We're about to change the hdr's b_flags. We must either * hold the hash_lock or be undiscoverable. */ ASSERT(HDR_EMPTY_OR_LOCKED(hdr)); arc_cksum_verify(buf); arc_buf_unwatch(buf); if (arc_buf_is_shared(buf)) { arc_hdr_clear_flags(hdr, ARC_FLAG_SHARED_DATA); } else { uint64_t size = arc_buf_size(buf); arc_free_data_buf(hdr, buf->b_data, size, buf); ARCSTAT_INCR(arcstat_overhead_size, -size); } buf->b_data = NULL; ASSERT(hdr->b_l1hdr.b_bufcnt > 0); hdr->b_l1hdr.b_bufcnt -= 1; if (ARC_BUF_ENCRYPTED(buf)) { hdr->b_crypt_hdr.b_ebufcnt -= 1; /* * If we have no more encrypted buffers and we've * already gotten a copy of the decrypted data we can * free b_rabd to save some space. */ if (hdr->b_crypt_hdr.b_ebufcnt == 0 && HDR_HAS_RABD(hdr) && hdr->b_l1hdr.b_pabd != NULL && !HDR_IO_IN_PROGRESS(hdr)) { arc_hdr_free_abd(hdr, B_TRUE); } } } arc_buf_t *lastbuf = arc_buf_remove(hdr, buf); if (ARC_BUF_SHARED(buf) && !ARC_BUF_COMPRESSED(buf)) { /* * If the current arc_buf_t is sharing its data buffer with the * hdr, then reassign the hdr's b_pabd to share it with the new * buffer at the end of the list. The shared buffer is always * the last one on the hdr's buffer list. * * There is an equivalent case for compressed bufs, but since * they aren't guaranteed to be the last buf in the list and * that is an exceedingly rare case, we just allow that space be * wasted temporarily. We must also be careful not to share * encrypted buffers, since they cannot be shared. */ if (lastbuf != NULL && !ARC_BUF_ENCRYPTED(lastbuf)) { /* Only one buf can be shared at once */ VERIFY(!arc_buf_is_shared(lastbuf)); /* hdr is uncompressed so can't have compressed buf */ VERIFY(!ARC_BUF_COMPRESSED(lastbuf)); ASSERT3P(hdr->b_l1hdr.b_pabd, !=, NULL); arc_hdr_free_abd(hdr, B_FALSE); /* * We must setup a new shared block between the * last buffer and the hdr. The data would have * been allocated by the arc buf so we need to transfer * ownership to the hdr since it's now being shared. */ arc_share_buf(hdr, lastbuf); } } else if (HDR_SHARED_DATA(hdr)) { /* * Uncompressed shared buffers are always at the end * of the list. Compressed buffers don't have the * same requirements. This makes it hard to * simply assert that the lastbuf is shared so * we rely on the hdr's compression flags to determine * if we have a compressed, shared buffer. */ ASSERT3P(lastbuf, !=, NULL); ASSERT(arc_buf_is_shared(lastbuf) || arc_hdr_get_compress(hdr) != ZIO_COMPRESS_OFF); } /* * Free the checksum if we're removing the last uncompressed buf from * this hdr. */ if (!arc_hdr_has_uncompressed_buf(hdr)) { arc_cksum_free(hdr); } /* clean up the buf */ buf->b_hdr = NULL; kmem_cache_free(buf_cache, buf); } static void arc_hdr_alloc_abd(arc_buf_hdr_t *hdr, int alloc_flags) { uint64_t size; boolean_t alloc_rdata = ((alloc_flags & ARC_HDR_ALLOC_RDATA) != 0); ASSERT3U(HDR_GET_LSIZE(hdr), >, 0); ASSERT(HDR_HAS_L1HDR(hdr)); ASSERT(!HDR_SHARED_DATA(hdr) || alloc_rdata); IMPLY(alloc_rdata, HDR_PROTECTED(hdr)); if (alloc_rdata) { size = HDR_GET_PSIZE(hdr); ASSERT3P(hdr->b_crypt_hdr.b_rabd, ==, NULL); hdr->b_crypt_hdr.b_rabd = arc_get_data_abd(hdr, size, hdr, alloc_flags); ASSERT3P(hdr->b_crypt_hdr.b_rabd, !=, NULL); ARCSTAT_INCR(arcstat_raw_size, size); } else { size = arc_hdr_size(hdr); ASSERT3P(hdr->b_l1hdr.b_pabd, ==, NULL); hdr->b_l1hdr.b_pabd = arc_get_data_abd(hdr, size, hdr, alloc_flags); ASSERT3P(hdr->b_l1hdr.b_pabd, !=, NULL); } ARCSTAT_INCR(arcstat_compressed_size, size); ARCSTAT_INCR(arcstat_uncompressed_size, HDR_GET_LSIZE(hdr)); } static void arc_hdr_free_abd(arc_buf_hdr_t *hdr, boolean_t free_rdata) { uint64_t size = (free_rdata) ? HDR_GET_PSIZE(hdr) : arc_hdr_size(hdr); ASSERT(HDR_HAS_L1HDR(hdr)); ASSERT(hdr->b_l1hdr.b_pabd != NULL || HDR_HAS_RABD(hdr)); IMPLY(free_rdata, HDR_HAS_RABD(hdr)); /* * If the hdr is currently being written to the l2arc then * we defer freeing the data by adding it to the l2arc_free_on_write * list. The l2arc will free the data once it's finished * writing it to the l2arc device. */ if (HDR_L2_WRITING(hdr)) { arc_hdr_free_on_write(hdr, free_rdata); ARCSTAT_BUMP(arcstat_l2_free_on_write); } else if (free_rdata) { arc_free_data_abd(hdr, hdr->b_crypt_hdr.b_rabd, size, hdr); } else { arc_free_data_abd(hdr, hdr->b_l1hdr.b_pabd, size, hdr); } if (free_rdata) { hdr->b_crypt_hdr.b_rabd = NULL; ARCSTAT_INCR(arcstat_raw_size, -size); } else { hdr->b_l1hdr.b_pabd = NULL; } if (hdr->b_l1hdr.b_pabd == NULL && !HDR_HAS_RABD(hdr)) hdr->b_l1hdr.b_byteswap = DMU_BSWAP_NUMFUNCS; ARCSTAT_INCR(arcstat_compressed_size, -size); ARCSTAT_INCR(arcstat_uncompressed_size, -HDR_GET_LSIZE(hdr)); } /* * Allocate empty anonymous ARC header. The header will get its identity * assigned and buffers attached later as part of read or write operations. * * In case of read arc_read() assigns header its identify (b_dva + b_birth), * inserts it into ARC hash to become globally visible and allocates physical * (b_pabd) or raw (b_rabd) ABD buffer to read into from disk. On disk read * completion arc_read_done() allocates ARC buffer(s) as needed, potentially * sharing one of them with the physical ABD buffer. * * In case of write arc_alloc_buf() allocates ARC buffer to be filled with * data. Then after compression and/or encryption arc_write_ready() allocates * and fills (or potentially shares) physical (b_pabd) or raw (b_rabd) ABD * buffer. On disk write completion arc_write_done() assigns the header its * new identity (b_dva + b_birth) and inserts into ARC hash. * * In case of partial overwrite the old data is read first as described. Then * arc_release() either allocates new anonymous ARC header and moves the ARC * buffer to it, or reuses the old ARC header by discarding its identity and * removing it from ARC hash. After buffer modification normal write process * follows as described. */ static arc_buf_hdr_t * arc_hdr_alloc(uint64_t spa, int32_t psize, int32_t lsize, boolean_t protected, enum zio_compress compression_type, uint8_t complevel, arc_buf_contents_t type) { arc_buf_hdr_t *hdr; VERIFY(type == ARC_BUFC_DATA || type == ARC_BUFC_METADATA); if (protected) { hdr = kmem_cache_alloc(hdr_full_crypt_cache, KM_PUSHPAGE); } else { hdr = kmem_cache_alloc(hdr_full_cache, KM_PUSHPAGE); } ASSERT(HDR_EMPTY(hdr)); ASSERT3P(hdr->b_l1hdr.b_freeze_cksum, ==, NULL); HDR_SET_PSIZE(hdr, psize); HDR_SET_LSIZE(hdr, lsize); hdr->b_spa = spa; hdr->b_type = type; hdr->b_flags = 0; arc_hdr_set_flags(hdr, arc_bufc_to_flags(type) | ARC_FLAG_HAS_L1HDR); arc_hdr_set_compress(hdr, compression_type); hdr->b_complevel = complevel; if (protected) arc_hdr_set_flags(hdr, ARC_FLAG_PROTECTED); hdr->b_l1hdr.b_state = arc_anon; hdr->b_l1hdr.b_arc_access = 0; hdr->b_l1hdr.b_mru_hits = 0; hdr->b_l1hdr.b_mru_ghost_hits = 0; hdr->b_l1hdr.b_mfu_hits = 0; hdr->b_l1hdr.b_mfu_ghost_hits = 0; hdr->b_l1hdr.b_bufcnt = 0; hdr->b_l1hdr.b_buf = NULL; ASSERT(zfs_refcount_is_zero(&hdr->b_l1hdr.b_refcnt)); return (hdr); } /* * Transition between the two allocation states for the arc_buf_hdr struct. * The arc_buf_hdr struct can be allocated with (hdr_full_cache) or without * (hdr_l2only_cache) the fields necessary for the L1 cache - the smaller * version is used when a cache buffer is only in the L2ARC in order to reduce * memory usage. */ static arc_buf_hdr_t * arc_hdr_realloc(arc_buf_hdr_t *hdr, kmem_cache_t *old, kmem_cache_t *new) { ASSERT(HDR_HAS_L2HDR(hdr)); arc_buf_hdr_t *nhdr; l2arc_dev_t *dev = hdr->b_l2hdr.b_dev; ASSERT((old == hdr_full_cache && new == hdr_l2only_cache) || (old == hdr_l2only_cache && new == hdr_full_cache)); /* * if the caller wanted a new full header and the header is to be * encrypted we will actually allocate the header from the full crypt * cache instead. The same applies to freeing from the old cache. */ if (HDR_PROTECTED(hdr) && new == hdr_full_cache) new = hdr_full_crypt_cache; if (HDR_PROTECTED(hdr) && old == hdr_full_cache) old = hdr_full_crypt_cache; nhdr = kmem_cache_alloc(new, KM_PUSHPAGE); ASSERT(MUTEX_HELD(HDR_LOCK(hdr))); buf_hash_remove(hdr); memcpy(nhdr, hdr, HDR_L2ONLY_SIZE); if (new == hdr_full_cache || new == hdr_full_crypt_cache) { arc_hdr_set_flags(nhdr, ARC_FLAG_HAS_L1HDR); /* * arc_access and arc_change_state need to be aware that a * header has just come out of L2ARC, so we set its state to * l2c_only even though it's about to change. */ nhdr->b_l1hdr.b_state = arc_l2c_only; /* Verify previous threads set to NULL before freeing */ ASSERT3P(nhdr->b_l1hdr.b_pabd, ==, NULL); ASSERT(!HDR_HAS_RABD(hdr)); } else { ASSERT3P(hdr->b_l1hdr.b_buf, ==, NULL); ASSERT0(hdr->b_l1hdr.b_bufcnt); ASSERT3P(hdr->b_l1hdr.b_freeze_cksum, ==, NULL); /* * If we've reached here, We must have been called from * arc_evict_hdr(), as such we should have already been * removed from any ghost list we were previously on * (which protects us from racing with arc_evict_state), * thus no locking is needed during this check. */ ASSERT(!multilist_link_active(&hdr->b_l1hdr.b_arc_node)); /* * A buffer must not be moved into the arc_l2c_only * state if it's not finished being written out to the * l2arc device. Otherwise, the b_l1hdr.b_pabd field * might try to be accessed, even though it was removed. */ VERIFY(!HDR_L2_WRITING(hdr)); VERIFY3P(hdr->b_l1hdr.b_pabd, ==, NULL); ASSERT(!HDR_HAS_RABD(hdr)); arc_hdr_clear_flags(nhdr, ARC_FLAG_HAS_L1HDR); } /* * The header has been reallocated so we need to re-insert it into any * lists it was on. */ (void) buf_hash_insert(nhdr, NULL); ASSERT(list_link_active(&hdr->b_l2hdr.b_l2node)); mutex_enter(&dev->l2ad_mtx); /* * We must place the realloc'ed header back into the list at * the same spot. Otherwise, if it's placed earlier in the list, * l2arc_write_buffers() could find it during the function's * write phase, and try to write it out to the l2arc. */ list_insert_after(&dev->l2ad_buflist, hdr, nhdr); list_remove(&dev->l2ad_buflist, hdr); mutex_exit(&dev->l2ad_mtx); /* * Since we're using the pointer address as the tag when * incrementing and decrementing the l2ad_alloc refcount, we * must remove the old pointer (that we're about to destroy) and * add the new pointer to the refcount. Otherwise we'd remove * the wrong pointer address when calling arc_hdr_destroy() later. */ (void) zfs_refcount_remove_many(&dev->l2ad_alloc, arc_hdr_size(hdr), hdr); (void) zfs_refcount_add_many(&dev->l2ad_alloc, arc_hdr_size(nhdr), nhdr); buf_discard_identity(hdr); kmem_cache_free(old, hdr); return (nhdr); } /* * This function allows an L1 header to be reallocated as a crypt * header and vice versa. If we are going to a crypt header, the * new fields will be zeroed out. */ static arc_buf_hdr_t * arc_hdr_realloc_crypt(arc_buf_hdr_t *hdr, boolean_t need_crypt) { arc_buf_hdr_t *nhdr; arc_buf_t *buf; kmem_cache_t *ncache, *ocache; /* * This function requires that hdr is in the arc_anon state. * Therefore it won't have any L2ARC data for us to worry * about copying. */ ASSERT(HDR_HAS_L1HDR(hdr)); ASSERT(!HDR_HAS_L2HDR(hdr)); ASSERT3U(!!HDR_PROTECTED(hdr), !=, need_crypt); ASSERT3P(hdr->b_l1hdr.b_state, ==, arc_anon); ASSERT(!multilist_link_active(&hdr->b_l1hdr.b_arc_node)); ASSERT(!list_link_active(&hdr->b_l2hdr.b_l2node)); ASSERT3P(hdr->b_hash_next, ==, NULL); if (need_crypt) { ncache = hdr_full_crypt_cache; ocache = hdr_full_cache; } else { ncache = hdr_full_cache; ocache = hdr_full_crypt_cache; } nhdr = kmem_cache_alloc(ncache, KM_PUSHPAGE); /* * Copy all members that aren't locks or condvars to the new header. * No lists are pointing to us (as we asserted above), so we don't * need to worry about the list nodes. */ nhdr->b_dva = hdr->b_dva; nhdr->b_birth = hdr->b_birth; nhdr->b_type = hdr->b_type; nhdr->b_flags = hdr->b_flags; nhdr->b_psize = hdr->b_psize; nhdr->b_lsize = hdr->b_lsize; nhdr->b_spa = hdr->b_spa; nhdr->b_l1hdr.b_freeze_cksum = hdr->b_l1hdr.b_freeze_cksum; nhdr->b_l1hdr.b_bufcnt = hdr->b_l1hdr.b_bufcnt; nhdr->b_l1hdr.b_byteswap = hdr->b_l1hdr.b_byteswap; nhdr->b_l1hdr.b_state = hdr->b_l1hdr.b_state; nhdr->b_l1hdr.b_arc_access = hdr->b_l1hdr.b_arc_access; nhdr->b_l1hdr.b_mru_hits = hdr->b_l1hdr.b_mru_hits; nhdr->b_l1hdr.b_mru_ghost_hits = hdr->b_l1hdr.b_mru_ghost_hits; nhdr->b_l1hdr.b_mfu_hits = hdr->b_l1hdr.b_mfu_hits; nhdr->b_l1hdr.b_mfu_ghost_hits = hdr->b_l1hdr.b_mfu_ghost_hits; nhdr->b_l1hdr.b_acb = hdr->b_l1hdr.b_acb; nhdr->b_l1hdr.b_pabd = hdr->b_l1hdr.b_pabd; /* * This zfs_refcount_add() exists only to ensure that the individual * arc buffers always point to a header that is referenced, avoiding * a small race condition that could trigger ASSERTs. */ (void) zfs_refcount_add(&nhdr->b_l1hdr.b_refcnt, FTAG); nhdr->b_l1hdr.b_buf = hdr->b_l1hdr.b_buf; for (buf = nhdr->b_l1hdr.b_buf; buf != NULL; buf = buf->b_next) { mutex_enter(&buf->b_evict_lock); buf->b_hdr = nhdr; mutex_exit(&buf->b_evict_lock); } zfs_refcount_transfer(&nhdr->b_l1hdr.b_refcnt, &hdr->b_l1hdr.b_refcnt); (void) zfs_refcount_remove(&nhdr->b_l1hdr.b_refcnt, FTAG); ASSERT0(zfs_refcount_count(&hdr->b_l1hdr.b_refcnt)); if (need_crypt) { arc_hdr_set_flags(nhdr, ARC_FLAG_PROTECTED); } else { arc_hdr_clear_flags(nhdr, ARC_FLAG_PROTECTED); } /* unset all members of the original hdr */ memset(&hdr->b_dva, 0, sizeof (dva_t)); hdr->b_birth = 0; hdr->b_type = ARC_BUFC_INVALID; hdr->b_flags = 0; hdr->b_psize = 0; hdr->b_lsize = 0; hdr->b_spa = 0; hdr->b_l1hdr.b_freeze_cksum = NULL; hdr->b_l1hdr.b_buf = NULL; hdr->b_l1hdr.b_bufcnt = 0; hdr->b_l1hdr.b_byteswap = 0; hdr->b_l1hdr.b_state = NULL; hdr->b_l1hdr.b_arc_access = 0; hdr->b_l1hdr.b_mru_hits = 0; hdr->b_l1hdr.b_mru_ghost_hits = 0; hdr->b_l1hdr.b_mfu_hits = 0; hdr->b_l1hdr.b_mfu_ghost_hits = 0; hdr->b_l1hdr.b_acb = NULL; hdr->b_l1hdr.b_pabd = NULL; if (ocache == hdr_full_crypt_cache) { ASSERT(!HDR_HAS_RABD(hdr)); hdr->b_crypt_hdr.b_ot = DMU_OT_NONE; hdr->b_crypt_hdr.b_ebufcnt = 0; hdr->b_crypt_hdr.b_dsobj = 0; memset(hdr->b_crypt_hdr.b_salt, 0, ZIO_DATA_SALT_LEN); memset(hdr->b_crypt_hdr.b_iv, 0, ZIO_DATA_IV_LEN); memset(hdr->b_crypt_hdr.b_mac, 0, ZIO_DATA_MAC_LEN); } buf_discard_identity(hdr); kmem_cache_free(ocache, hdr); return (nhdr); } /* * This function is used by the send / receive code to convert a newly * allocated arc_buf_t to one that is suitable for a raw encrypted write. It * is also used to allow the root objset block to be updated without altering * its embedded MACs. Both block types will always be uncompressed so we do not * have to worry about compression type or psize. */ void arc_convert_to_raw(arc_buf_t *buf, uint64_t dsobj, boolean_t byteorder, dmu_object_type_t ot, const uint8_t *salt, const uint8_t *iv, const uint8_t *mac) { arc_buf_hdr_t *hdr = buf->b_hdr; ASSERT(ot == DMU_OT_DNODE || ot == DMU_OT_OBJSET); ASSERT(HDR_HAS_L1HDR(hdr)); ASSERT3P(hdr->b_l1hdr.b_state, ==, arc_anon); buf->b_flags |= (ARC_BUF_FLAG_COMPRESSED | ARC_BUF_FLAG_ENCRYPTED); if (!HDR_PROTECTED(hdr)) hdr = arc_hdr_realloc_crypt(hdr, B_TRUE); hdr->b_crypt_hdr.b_dsobj = dsobj; hdr->b_crypt_hdr.b_ot = ot; hdr->b_l1hdr.b_byteswap = (byteorder == ZFS_HOST_BYTEORDER) ? DMU_BSWAP_NUMFUNCS : DMU_OT_BYTESWAP(ot); if (!arc_hdr_has_uncompressed_buf(hdr)) arc_cksum_free(hdr); if (salt != NULL) memcpy(hdr->b_crypt_hdr.b_salt, salt, ZIO_DATA_SALT_LEN); if (iv != NULL) memcpy(hdr->b_crypt_hdr.b_iv, iv, ZIO_DATA_IV_LEN); if (mac != NULL) memcpy(hdr->b_crypt_hdr.b_mac, mac, ZIO_DATA_MAC_LEN); } /* * Allocate a new arc_buf_hdr_t and arc_buf_t and return the buf to the caller. * The buf is returned thawed since we expect the consumer to modify it. */ arc_buf_t * arc_alloc_buf(spa_t *spa, const void *tag, arc_buf_contents_t type, int32_t size) { arc_buf_hdr_t *hdr = arc_hdr_alloc(spa_load_guid(spa), size, size, B_FALSE, ZIO_COMPRESS_OFF, 0, type); arc_buf_t *buf = NULL; VERIFY0(arc_buf_alloc_impl(hdr, spa, NULL, tag, B_FALSE, B_FALSE, B_FALSE, B_FALSE, &buf)); arc_buf_thaw(buf); return (buf); } /* * Allocate a compressed buf in the same manner as arc_alloc_buf. Don't use this * for bufs containing metadata. */ arc_buf_t * arc_alloc_compressed_buf(spa_t *spa, const void *tag, uint64_t psize, uint64_t lsize, enum zio_compress compression_type, uint8_t complevel) { ASSERT3U(lsize, >, 0); ASSERT3U(lsize, >=, psize); ASSERT3U(compression_type, >, ZIO_COMPRESS_OFF); ASSERT3U(compression_type, <, ZIO_COMPRESS_FUNCTIONS); arc_buf_hdr_t *hdr = arc_hdr_alloc(spa_load_guid(spa), psize, lsize, B_FALSE, compression_type, complevel, ARC_BUFC_DATA); arc_buf_t *buf = NULL; VERIFY0(arc_buf_alloc_impl(hdr, spa, NULL, tag, B_FALSE, B_TRUE, B_FALSE, B_FALSE, &buf)); arc_buf_thaw(buf); ASSERT3P(hdr->b_l1hdr.b_freeze_cksum, ==, NULL); /* * To ensure that the hdr has the correct data in it if we call * arc_untransform() on this buf before it's been written to disk, * it's easiest if we just set up sharing between the buf and the hdr. */ arc_share_buf(hdr, buf); return (buf); } arc_buf_t * arc_alloc_raw_buf(spa_t *spa, const void *tag, uint64_t dsobj, boolean_t byteorder, const uint8_t *salt, const uint8_t *iv, const uint8_t *mac, dmu_object_type_t ot, uint64_t psize, uint64_t lsize, enum zio_compress compression_type, uint8_t complevel) { arc_buf_hdr_t *hdr; arc_buf_t *buf; arc_buf_contents_t type = DMU_OT_IS_METADATA(ot) ? ARC_BUFC_METADATA : ARC_BUFC_DATA; ASSERT3U(lsize, >, 0); ASSERT3U(lsize, >=, psize); ASSERT3U(compression_type, >=, ZIO_COMPRESS_OFF); ASSERT3U(compression_type, <, ZIO_COMPRESS_FUNCTIONS); hdr = arc_hdr_alloc(spa_load_guid(spa), psize, lsize, B_TRUE, compression_type, complevel, type); hdr->b_crypt_hdr.b_dsobj = dsobj; hdr->b_crypt_hdr.b_ot = ot; hdr->b_l1hdr.b_byteswap = (byteorder == ZFS_HOST_BYTEORDER) ? DMU_BSWAP_NUMFUNCS : DMU_OT_BYTESWAP(ot); memcpy(hdr->b_crypt_hdr.b_salt, salt, ZIO_DATA_SALT_LEN); memcpy(hdr->b_crypt_hdr.b_iv, iv, ZIO_DATA_IV_LEN); memcpy(hdr->b_crypt_hdr.b_mac, mac, ZIO_DATA_MAC_LEN); /* * This buffer will be considered encrypted even if the ot is not an * encrypted type. It will become authenticated instead in * arc_write_ready(). */ buf = NULL; VERIFY0(arc_buf_alloc_impl(hdr, spa, NULL, tag, B_TRUE, B_TRUE, B_FALSE, B_FALSE, &buf)); arc_buf_thaw(buf); ASSERT3P(hdr->b_l1hdr.b_freeze_cksum, ==, NULL); return (buf); } static void l2arc_hdr_arcstats_update(arc_buf_hdr_t *hdr, boolean_t incr, boolean_t state_only) { l2arc_buf_hdr_t *l2hdr = &hdr->b_l2hdr; l2arc_dev_t *dev = l2hdr->b_dev; uint64_t lsize = HDR_GET_LSIZE(hdr); uint64_t psize = HDR_GET_PSIZE(hdr); uint64_t asize = vdev_psize_to_asize(dev->l2ad_vdev, psize); arc_buf_contents_t type = hdr->b_type; int64_t lsize_s; int64_t psize_s; int64_t asize_s; if (incr) { lsize_s = lsize; psize_s = psize; asize_s = asize; } else { lsize_s = -lsize; psize_s = -psize; asize_s = -asize; } /* If the buffer is a prefetch, count it as such. */ if (HDR_PREFETCH(hdr)) { ARCSTAT_INCR(arcstat_l2_prefetch_asize, asize_s); } else { /* * We use the value stored in the L2 header upon initial * caching in L2ARC. This value will be updated in case * an MRU/MRU_ghost buffer transitions to MFU but the L2ARC * metadata (log entry) cannot currently be updated. Having * the ARC state in the L2 header solves the problem of a * possibly absent L1 header (apparent in buffers restored * from persistent L2ARC). */ switch (hdr->b_l2hdr.b_arcs_state) { case ARC_STATE_MRU_GHOST: case ARC_STATE_MRU: ARCSTAT_INCR(arcstat_l2_mru_asize, asize_s); break; case ARC_STATE_MFU_GHOST: case ARC_STATE_MFU: ARCSTAT_INCR(arcstat_l2_mfu_asize, asize_s); break; default: break; } } if (state_only) return; ARCSTAT_INCR(arcstat_l2_psize, psize_s); ARCSTAT_INCR(arcstat_l2_lsize, lsize_s); switch (type) { case ARC_BUFC_DATA: ARCSTAT_INCR(arcstat_l2_bufc_data_asize, asize_s); break; case ARC_BUFC_METADATA: ARCSTAT_INCR(arcstat_l2_bufc_metadata_asize, asize_s); break; default: break; } } static void arc_hdr_l2hdr_destroy(arc_buf_hdr_t *hdr) { l2arc_buf_hdr_t *l2hdr = &hdr->b_l2hdr; l2arc_dev_t *dev = l2hdr->b_dev; uint64_t psize = HDR_GET_PSIZE(hdr); uint64_t asize = vdev_psize_to_asize(dev->l2ad_vdev, psize); ASSERT(MUTEX_HELD(&dev->l2ad_mtx)); ASSERT(HDR_HAS_L2HDR(hdr)); list_remove(&dev->l2ad_buflist, hdr); l2arc_hdr_arcstats_decrement(hdr); vdev_space_update(dev->l2ad_vdev, -asize, 0, 0); (void) zfs_refcount_remove_many(&dev->l2ad_alloc, arc_hdr_size(hdr), hdr); arc_hdr_clear_flags(hdr, ARC_FLAG_HAS_L2HDR); } static void arc_hdr_destroy(arc_buf_hdr_t *hdr) { if (HDR_HAS_L1HDR(hdr)) { ASSERT(hdr->b_l1hdr.b_buf == NULL || hdr->b_l1hdr.b_bufcnt > 0); ASSERT(zfs_refcount_is_zero(&hdr->b_l1hdr.b_refcnt)); ASSERT3P(hdr->b_l1hdr.b_state, ==, arc_anon); } ASSERT(!HDR_IO_IN_PROGRESS(hdr)); ASSERT(!HDR_IN_HASH_TABLE(hdr)); if (HDR_HAS_L2HDR(hdr)) { l2arc_dev_t *dev = hdr->b_l2hdr.b_dev; boolean_t buflist_held = MUTEX_HELD(&dev->l2ad_mtx); if (!buflist_held) mutex_enter(&dev->l2ad_mtx); /* * Even though we checked this conditional above, we * need to check this again now that we have the * l2ad_mtx. This is because we could be racing with * another thread calling l2arc_evict() which might have * destroyed this header's L2 portion as we were waiting * to acquire the l2ad_mtx. If that happens, we don't * want to re-destroy the header's L2 portion. */ if (HDR_HAS_L2HDR(hdr)) { if (!HDR_EMPTY(hdr)) buf_discard_identity(hdr); arc_hdr_l2hdr_destroy(hdr); } if (!buflist_held) mutex_exit(&dev->l2ad_mtx); } /* * The header's identify can only be safely discarded once it is no * longer discoverable. This requires removing it from the hash table * and the l2arc header list. After this point the hash lock can not * be used to protect the header. */ if (!HDR_EMPTY(hdr)) buf_discard_identity(hdr); if (HDR_HAS_L1HDR(hdr)) { arc_cksum_free(hdr); while (hdr->b_l1hdr.b_buf != NULL) arc_buf_destroy_impl(hdr->b_l1hdr.b_buf); if (hdr->b_l1hdr.b_pabd != NULL) arc_hdr_free_abd(hdr, B_FALSE); if (HDR_HAS_RABD(hdr)) arc_hdr_free_abd(hdr, B_TRUE); } ASSERT3P(hdr->b_hash_next, ==, NULL); if (HDR_HAS_L1HDR(hdr)) { ASSERT(!multilist_link_active(&hdr->b_l1hdr.b_arc_node)); ASSERT3P(hdr->b_l1hdr.b_acb, ==, NULL); if (!HDR_PROTECTED(hdr)) { kmem_cache_free(hdr_full_cache, hdr); } else { kmem_cache_free(hdr_full_crypt_cache, hdr); } } else { kmem_cache_free(hdr_l2only_cache, hdr); } } void arc_buf_destroy(arc_buf_t *buf, const void *tag) { arc_buf_hdr_t *hdr = buf->b_hdr; if (hdr->b_l1hdr.b_state == arc_anon) { ASSERT3U(hdr->b_l1hdr.b_bufcnt, ==, 1); ASSERT(!HDR_IO_IN_PROGRESS(hdr)); VERIFY0(remove_reference(hdr, NULL, tag)); arc_hdr_destroy(hdr); return; } kmutex_t *hash_lock = HDR_LOCK(hdr); mutex_enter(hash_lock); ASSERT3P(hdr, ==, buf->b_hdr); ASSERT(hdr->b_l1hdr.b_bufcnt > 0); ASSERT3P(hash_lock, ==, HDR_LOCK(hdr)); ASSERT3P(hdr->b_l1hdr.b_state, !=, arc_anon); ASSERT3P(buf->b_data, !=, NULL); (void) remove_reference(hdr, hash_lock, tag); arc_buf_destroy_impl(buf); mutex_exit(hash_lock); } /* * Evict the arc_buf_hdr that is provided as a parameter. The resultant * state of the header is dependent on its state prior to entering this * function. The following transitions are possible: * * - arc_mru -> arc_mru_ghost * - arc_mfu -> arc_mfu_ghost * - arc_mru_ghost -> arc_l2c_only * - arc_mru_ghost -> deleted * - arc_mfu_ghost -> arc_l2c_only * - arc_mfu_ghost -> deleted * * Return total size of evicted data buffers for eviction progress tracking. * When evicting from ghost states return logical buffer size to make eviction * progress at the same (or at least comparable) rate as from non-ghost states. * * Return *real_evicted for actual ARC size reduction to wake up threads * waiting for it. For non-ghost states it includes size of evicted data * buffers (the headers are not freed there). For ghost states it includes * only the evicted headers size. */ static int64_t arc_evict_hdr(arc_buf_hdr_t *hdr, kmutex_t *hash_lock, uint64_t *real_evicted) { arc_state_t *evicted_state, *state; int64_t bytes_evicted = 0; uint_t min_lifetime = HDR_PRESCIENT_PREFETCH(hdr) ? arc_min_prescient_prefetch_ms : arc_min_prefetch_ms; ASSERT(MUTEX_HELD(hash_lock)); ASSERT(HDR_HAS_L1HDR(hdr)); *real_evicted = 0; state = hdr->b_l1hdr.b_state; if (GHOST_STATE(state)) { ASSERT(!HDR_IO_IN_PROGRESS(hdr)); ASSERT3P(hdr->b_l1hdr.b_buf, ==, NULL); /* * l2arc_write_buffers() relies on a header's L1 portion * (i.e. its b_pabd field) during it's write phase. * Thus, we cannot push a header onto the arc_l2c_only * state (removing its L1 piece) until the header is * done being written to the l2arc. */ if (HDR_HAS_L2HDR(hdr) && HDR_L2_WRITING(hdr)) { ARCSTAT_BUMP(arcstat_evict_l2_skip); return (bytes_evicted); } ARCSTAT_BUMP(arcstat_deleted); bytes_evicted += HDR_GET_LSIZE(hdr); DTRACE_PROBE1(arc__delete, arc_buf_hdr_t *, hdr); if (HDR_HAS_L2HDR(hdr)) { ASSERT(hdr->b_l1hdr.b_pabd == NULL); ASSERT(!HDR_HAS_RABD(hdr)); /* * This buffer is cached on the 2nd Level ARC; * don't destroy the header. */ arc_change_state(arc_l2c_only, hdr, hash_lock); /* * dropping from L1+L2 cached to L2-only, * realloc to remove the L1 header. */ - hdr = arc_hdr_realloc(hdr, hdr_full_cache, + (void) arc_hdr_realloc(hdr, hdr_full_cache, hdr_l2only_cache); *real_evicted += HDR_FULL_SIZE - HDR_L2ONLY_SIZE; } else { arc_change_state(arc_anon, hdr, hash_lock); arc_hdr_destroy(hdr); *real_evicted += HDR_FULL_SIZE; } return (bytes_evicted); } ASSERT(state == arc_mru || state == arc_mfu); evicted_state = (state == arc_mru) ? arc_mru_ghost : arc_mfu_ghost; /* prefetch buffers have a minimum lifespan */ if (HDR_IO_IN_PROGRESS(hdr) || ((hdr->b_flags & (ARC_FLAG_PREFETCH | ARC_FLAG_INDIRECT)) && ddi_get_lbolt() - hdr->b_l1hdr.b_arc_access < MSEC_TO_TICK(min_lifetime))) { ARCSTAT_BUMP(arcstat_evict_skip); return (bytes_evicted); } ASSERT0(zfs_refcount_count(&hdr->b_l1hdr.b_refcnt)); while (hdr->b_l1hdr.b_buf) { arc_buf_t *buf = hdr->b_l1hdr.b_buf; if (!mutex_tryenter(&buf->b_evict_lock)) { ARCSTAT_BUMP(arcstat_mutex_miss); break; } if (buf->b_data != NULL) { bytes_evicted += HDR_GET_LSIZE(hdr); *real_evicted += HDR_GET_LSIZE(hdr); } mutex_exit(&buf->b_evict_lock); arc_buf_destroy_impl(buf); } if (HDR_HAS_L2HDR(hdr)) { ARCSTAT_INCR(arcstat_evict_l2_cached, HDR_GET_LSIZE(hdr)); } else { if (l2arc_write_eligible(hdr->b_spa, hdr)) { ARCSTAT_INCR(arcstat_evict_l2_eligible, HDR_GET_LSIZE(hdr)); switch (state->arcs_state) { case ARC_STATE_MRU: ARCSTAT_INCR( arcstat_evict_l2_eligible_mru, HDR_GET_LSIZE(hdr)); break; case ARC_STATE_MFU: ARCSTAT_INCR( arcstat_evict_l2_eligible_mfu, HDR_GET_LSIZE(hdr)); break; default: break; } } else { ARCSTAT_INCR(arcstat_evict_l2_ineligible, HDR_GET_LSIZE(hdr)); } } if (hdr->b_l1hdr.b_bufcnt == 0) { arc_cksum_free(hdr); bytes_evicted += arc_hdr_size(hdr); *real_evicted += arc_hdr_size(hdr); /* * If this hdr is being evicted and has a compressed * buffer then we discard it here before we change states. * This ensures that the accounting is updated correctly * in arc_free_data_impl(). */ if (hdr->b_l1hdr.b_pabd != NULL) arc_hdr_free_abd(hdr, B_FALSE); if (HDR_HAS_RABD(hdr)) arc_hdr_free_abd(hdr, B_TRUE); arc_change_state(evicted_state, hdr, hash_lock); ASSERT(HDR_IN_HASH_TABLE(hdr)); arc_hdr_set_flags(hdr, ARC_FLAG_IN_HASH_TABLE); DTRACE_PROBE1(arc__evict, arc_buf_hdr_t *, hdr); } return (bytes_evicted); } static void arc_set_need_free(void) { ASSERT(MUTEX_HELD(&arc_evict_lock)); int64_t remaining = arc_free_memory() - arc_sys_free / 2; arc_evict_waiter_t *aw = list_tail(&arc_evict_waiters); if (aw == NULL) { arc_need_free = MAX(-remaining, 0); } else { arc_need_free = MAX(-remaining, (int64_t)(aw->aew_count - arc_evict_count)); } } static uint64_t arc_evict_state_impl(multilist_t *ml, int idx, arc_buf_hdr_t *marker, uint64_t spa, uint64_t bytes) { multilist_sublist_t *mls; uint64_t bytes_evicted = 0, real_evicted = 0; arc_buf_hdr_t *hdr; kmutex_t *hash_lock; uint_t evict_count = zfs_arc_evict_batch_limit; ASSERT3P(marker, !=, NULL); mls = multilist_sublist_lock(ml, idx); for (hdr = multilist_sublist_prev(mls, marker); likely(hdr != NULL); hdr = multilist_sublist_prev(mls, marker)) { if ((evict_count == 0) || (bytes_evicted >= bytes)) break; /* * To keep our iteration location, move the marker * forward. Since we're not holding hdr's hash lock, we * must be very careful and not remove 'hdr' from the * sublist. Otherwise, other consumers might mistake the * 'hdr' as not being on a sublist when they call the * multilist_link_active() function (they all rely on * the hash lock protecting concurrent insertions and * removals). multilist_sublist_move_forward() was * specifically implemented to ensure this is the case * (only 'marker' will be removed and re-inserted). */ multilist_sublist_move_forward(mls, marker); /* * The only case where the b_spa field should ever be * zero, is the marker headers inserted by * arc_evict_state(). It's possible for multiple threads * to be calling arc_evict_state() concurrently (e.g. * dsl_pool_close() and zio_inject_fault()), so we must * skip any markers we see from these other threads. */ if (hdr->b_spa == 0) continue; /* we're only interested in evicting buffers of a certain spa */ if (spa != 0 && hdr->b_spa != spa) { ARCSTAT_BUMP(arcstat_evict_skip); continue; } hash_lock = HDR_LOCK(hdr); /* * We aren't calling this function from any code path * that would already be holding a hash lock, so we're * asserting on this assumption to be defensive in case * this ever changes. Without this check, it would be * possible to incorrectly increment arcstat_mutex_miss * below (e.g. if the code changed such that we called * this function with a hash lock held). */ ASSERT(!MUTEX_HELD(hash_lock)); if (mutex_tryenter(hash_lock)) { uint64_t revicted; uint64_t evicted = arc_evict_hdr(hdr, hash_lock, &revicted); mutex_exit(hash_lock); bytes_evicted += evicted; real_evicted += revicted; /* * If evicted is zero, arc_evict_hdr() must have * decided to skip this header, don't increment * evict_count in this case. */ if (evicted != 0) evict_count--; } else { ARCSTAT_BUMP(arcstat_mutex_miss); } } multilist_sublist_unlock(mls); /* * Increment the count of evicted bytes, and wake up any threads that * are waiting for the count to reach this value. Since the list is * ordered by ascending aew_count, we pop off the beginning of the * list until we reach the end, or a waiter that's past the current * "count". Doing this outside the loop reduces the number of times * we need to acquire the global arc_evict_lock. * * Only wake when there's sufficient free memory in the system * (specifically, arc_sys_free/2, which by default is a bit more than * 1/64th of RAM). See the comments in arc_wait_for_eviction(). */ mutex_enter(&arc_evict_lock); arc_evict_count += real_evicted; if (arc_free_memory() > arc_sys_free / 2) { arc_evict_waiter_t *aw; while ((aw = list_head(&arc_evict_waiters)) != NULL && aw->aew_count <= arc_evict_count) { list_remove(&arc_evict_waiters, aw); cv_broadcast(&aw->aew_cv); } } arc_set_need_free(); mutex_exit(&arc_evict_lock); /* * If the ARC size is reduced from arc_c_max to arc_c_min (especially * if the average cached block is small), eviction can be on-CPU for * many seconds. To ensure that other threads that may be bound to * this CPU are able to make progress, make a voluntary preemption * call here. */ kpreempt(KPREEMPT_SYNC); return (bytes_evicted); } /* * Allocate an array of buffer headers used as placeholders during arc state * eviction. */ static arc_buf_hdr_t ** arc_state_alloc_markers(int count) { arc_buf_hdr_t **markers; markers = kmem_zalloc(sizeof (*markers) * count, KM_SLEEP); for (int i = 0; i < count; i++) { markers[i] = kmem_cache_alloc(hdr_full_cache, KM_SLEEP); /* * A b_spa of 0 is used to indicate that this header is * a marker. This fact is used in arc_evict_type() and * arc_evict_state_impl(). */ markers[i]->b_spa = 0; } return (markers); } static void arc_state_free_markers(arc_buf_hdr_t **markers, int count) { for (int i = 0; i < count; i++) kmem_cache_free(hdr_full_cache, markers[i]); kmem_free(markers, sizeof (*markers) * count); } /* * Evict buffers from the given arc state, until we've removed the * specified number of bytes. Move the removed buffers to the * appropriate evict state. * * This function makes a "best effort". It skips over any buffers * it can't get a hash_lock on, and so, may not catch all candidates. * It may also return without evicting as much space as requested. * * If bytes is specified using the special value ARC_EVICT_ALL, this * will evict all available (i.e. unlocked and evictable) buffers from * the given arc state; which is used by arc_flush(). */ static uint64_t arc_evict_state(arc_state_t *state, uint64_t spa, uint64_t bytes, arc_buf_contents_t type) { uint64_t total_evicted = 0; multilist_t *ml = &state->arcs_list[type]; int num_sublists; arc_buf_hdr_t **markers; num_sublists = multilist_get_num_sublists(ml); /* * If we've tried to evict from each sublist, made some * progress, but still have not hit the target number of bytes * to evict, we want to keep trying. The markers allow us to * pick up where we left off for each individual sublist, rather * than starting from the tail each time. */ if (zthr_iscurthread(arc_evict_zthr)) { markers = arc_state_evict_markers; ASSERT3S(num_sublists, <=, arc_state_evict_marker_count); } else { markers = arc_state_alloc_markers(num_sublists); } for (int i = 0; i < num_sublists; i++) { multilist_sublist_t *mls; mls = multilist_sublist_lock(ml, i); multilist_sublist_insert_tail(mls, markers[i]); multilist_sublist_unlock(mls); } /* * While we haven't hit our target number of bytes to evict, or * we're evicting all available buffers. */ while (total_evicted < bytes) { int sublist_idx = multilist_get_random_index(ml); uint64_t scan_evicted = 0; /* * Try to reduce pinned dnodes with a floor of arc_dnode_limit. * Request that 10% of the LRUs be scanned by the superblock * shrinker. */ if (type == ARC_BUFC_DATA && aggsum_compare( &arc_sums.arcstat_dnode_size, arc_dnode_size_limit) > 0) { arc_prune_async((aggsum_upper_bound( &arc_sums.arcstat_dnode_size) - arc_dnode_size_limit) / sizeof (dnode_t) / zfs_arc_dnode_reduce_percent); } /* * Start eviction using a randomly selected sublist, * this is to try and evenly balance eviction across all * sublists. Always starting at the same sublist * (e.g. index 0) would cause evictions to favor certain * sublists over others. */ for (int i = 0; i < num_sublists; i++) { uint64_t bytes_remaining; uint64_t bytes_evicted; if (total_evicted < bytes) bytes_remaining = bytes - total_evicted; else break; bytes_evicted = arc_evict_state_impl(ml, sublist_idx, markers[sublist_idx], spa, bytes_remaining); scan_evicted += bytes_evicted; total_evicted += bytes_evicted; /* we've reached the end, wrap to the beginning */ if (++sublist_idx >= num_sublists) sublist_idx = 0; } /* * If we didn't evict anything during this scan, we have * no reason to believe we'll evict more during another * scan, so break the loop. */ if (scan_evicted == 0) { /* This isn't possible, let's make that obvious */ ASSERT3S(bytes, !=, 0); /* * When bytes is ARC_EVICT_ALL, the only way to * break the loop is when scan_evicted is zero. * In that case, we actually have evicted enough, * so we don't want to increment the kstat. */ if (bytes != ARC_EVICT_ALL) { ASSERT3S(total_evicted, <, bytes); ARCSTAT_BUMP(arcstat_evict_not_enough); } break; } } for (int i = 0; i < num_sublists; i++) { multilist_sublist_t *mls = multilist_sublist_lock(ml, i); multilist_sublist_remove(mls, markers[i]); multilist_sublist_unlock(mls); } if (markers != arc_state_evict_markers) arc_state_free_markers(markers, num_sublists); return (total_evicted); } /* * Flush all "evictable" data of the given type from the arc state * specified. This will not evict any "active" buffers (i.e. referenced). * * When 'retry' is set to B_FALSE, the function will make a single pass * over the state and evict any buffers that it can. Since it doesn't * continually retry the eviction, it might end up leaving some buffers * in the ARC due to lock misses. * * When 'retry' is set to B_TRUE, the function will continually retry the * eviction until *all* evictable buffers have been removed from the * state. As a result, if concurrent insertions into the state are * allowed (e.g. if the ARC isn't shutting down), this function might * wind up in an infinite loop, continually trying to evict buffers. */ static uint64_t arc_flush_state(arc_state_t *state, uint64_t spa, arc_buf_contents_t type, boolean_t retry) { uint64_t evicted = 0; while (zfs_refcount_count(&state->arcs_esize[type]) != 0) { evicted += arc_evict_state(state, spa, ARC_EVICT_ALL, type); if (!retry) break; } return (evicted); } /* * Evict the specified number of bytes from the state specified, * restricting eviction to the spa and type given. This function * prevents us from trying to evict more from a state's list than * is "evictable", and to skip evicting altogether when passed a * negative value for "bytes". In contrast, arc_evict_state() will * evict everything it can, when passed a negative value for "bytes". */ static uint64_t arc_evict_impl(arc_state_t *state, uint64_t spa, int64_t bytes, arc_buf_contents_t type) { uint64_t delta; if (bytes > 0 && zfs_refcount_count(&state->arcs_esize[type]) > 0) { delta = MIN(zfs_refcount_count(&state->arcs_esize[type]), bytes); return (arc_evict_state(state, spa, delta, type)); } return (0); } /* * The goal of this function is to evict enough meta data buffers from the * ARC in order to enforce the arc_meta_limit. Achieving this is slightly * more complicated than it appears because it is common for data buffers * to have holds on meta data buffers. In addition, dnode meta data buffers * will be held by the dnodes in the block preventing them from being freed. * This means we can't simply traverse the ARC and expect to always find * enough unheld meta data buffer to release. * * Therefore, this function has been updated to make alternating passes * over the ARC releasing data buffers and then newly unheld meta data * buffers. This ensures forward progress is maintained and meta_used * will decrease. Normally this is sufficient, but if required the ARC * will call the registered prune callbacks causing dentry and inodes to * be dropped from the VFS cache. This will make dnode meta data buffers * available for reclaim. */ static uint64_t arc_evict_meta_balanced(uint64_t meta_used) { int64_t delta, adjustmnt; uint64_t total_evicted = 0, prune = 0; arc_buf_contents_t type = ARC_BUFC_DATA; uint_t restarts = zfs_arc_meta_adjust_restarts; restart: /* * This slightly differs than the way we evict from the mru in * arc_evict because we don't have a "target" value (i.e. no * "meta" arc_p). As a result, I think we can completely * cannibalize the metadata in the MRU before we evict the * metadata from the MFU. I think we probably need to implement a * "metadata arc_p" value to do this properly. */ adjustmnt = meta_used - arc_meta_limit; if (adjustmnt > 0 && zfs_refcount_count(&arc_mru->arcs_esize[type]) > 0) { delta = MIN(zfs_refcount_count(&arc_mru->arcs_esize[type]), adjustmnt); total_evicted += arc_evict_impl(arc_mru, 0, delta, type); adjustmnt -= delta; } /* * We can't afford to recalculate adjustmnt here. If we do, * new metadata buffers can sneak into the MRU or ANON lists, * thus penalize the MFU metadata. Although the fudge factor is * small, it has been empirically shown to be significant for * certain workloads (e.g. creating many empty directories). As * such, we use the original calculation for adjustmnt, and * simply decrement the amount of data evicted from the MRU. */ if (adjustmnt > 0 && zfs_refcount_count(&arc_mfu->arcs_esize[type]) > 0) { delta = MIN(zfs_refcount_count(&arc_mfu->arcs_esize[type]), adjustmnt); total_evicted += arc_evict_impl(arc_mfu, 0, delta, type); } adjustmnt = meta_used - arc_meta_limit; if (adjustmnt > 0 && zfs_refcount_count(&arc_mru_ghost->arcs_esize[type]) > 0) { delta = MIN(adjustmnt, zfs_refcount_count(&arc_mru_ghost->arcs_esize[type])); total_evicted += arc_evict_impl(arc_mru_ghost, 0, delta, type); adjustmnt -= delta; } if (adjustmnt > 0 && zfs_refcount_count(&arc_mfu_ghost->arcs_esize[type]) > 0) { delta = MIN(adjustmnt, zfs_refcount_count(&arc_mfu_ghost->arcs_esize[type])); total_evicted += arc_evict_impl(arc_mfu_ghost, 0, delta, type); } /* * If after attempting to make the requested adjustment to the ARC * the meta limit is still being exceeded then request that the * higher layers drop some cached objects which have holds on ARC * meta buffers. Requests to the upper layers will be made with * increasingly large scan sizes until the ARC is below the limit. */ if (meta_used > arc_meta_limit) { if (type == ARC_BUFC_DATA) { type = ARC_BUFC_METADATA; } else { type = ARC_BUFC_DATA; if (zfs_arc_meta_prune) { prune += zfs_arc_meta_prune; arc_prune_async(prune); } } if (restarts > 0) { restarts--; goto restart; } } return (total_evicted); } /* * Evict metadata buffers from the cache, such that arcstat_meta_used is * capped by the arc_meta_limit tunable. */ static uint64_t arc_evict_meta_only(uint64_t meta_used) { uint64_t total_evicted = 0; int64_t target; /* * If we're over the meta limit, we want to evict enough * metadata to get back under the meta limit. We don't want to * evict so much that we drop the MRU below arc_p, though. If * we're over the meta limit more than we're over arc_p, we * evict some from the MRU here, and some from the MFU below. */ target = MIN((int64_t)(meta_used - arc_meta_limit), (int64_t)(zfs_refcount_count(&arc_anon->arcs_size) + zfs_refcount_count(&arc_mru->arcs_size) - arc_p)); total_evicted += arc_evict_impl(arc_mru, 0, target, ARC_BUFC_METADATA); /* * Similar to the above, we want to evict enough bytes to get us * below the meta limit, but not so much as to drop us below the * space allotted to the MFU (which is defined as arc_c - arc_p). */ target = MIN((int64_t)(meta_used - arc_meta_limit), (int64_t)(zfs_refcount_count(&arc_mfu->arcs_size) - (arc_c - arc_p))); total_evicted += arc_evict_impl(arc_mfu, 0, target, ARC_BUFC_METADATA); return (total_evicted); } static uint64_t arc_evict_meta(uint64_t meta_used) { if (zfs_arc_meta_strategy == ARC_STRATEGY_META_ONLY) return (arc_evict_meta_only(meta_used)); else return (arc_evict_meta_balanced(meta_used)); } /* * Return the type of the oldest buffer in the given arc state * * This function will select a random sublist of type ARC_BUFC_DATA and * a random sublist of type ARC_BUFC_METADATA. The tail of each sublist * is compared, and the type which contains the "older" buffer will be * returned. */ static arc_buf_contents_t arc_evict_type(arc_state_t *state) { multilist_t *data_ml = &state->arcs_list[ARC_BUFC_DATA]; multilist_t *meta_ml = &state->arcs_list[ARC_BUFC_METADATA]; int data_idx = multilist_get_random_index(data_ml); int meta_idx = multilist_get_random_index(meta_ml); multilist_sublist_t *data_mls; multilist_sublist_t *meta_mls; arc_buf_contents_t type; arc_buf_hdr_t *data_hdr; arc_buf_hdr_t *meta_hdr; /* * We keep the sublist lock until we're finished, to prevent * the headers from being destroyed via arc_evict_state(). */ data_mls = multilist_sublist_lock(data_ml, data_idx); meta_mls = multilist_sublist_lock(meta_ml, meta_idx); /* * These two loops are to ensure we skip any markers that * might be at the tail of the lists due to arc_evict_state(). */ for (data_hdr = multilist_sublist_tail(data_mls); data_hdr != NULL; data_hdr = multilist_sublist_prev(data_mls, data_hdr)) { if (data_hdr->b_spa != 0) break; } for (meta_hdr = multilist_sublist_tail(meta_mls); meta_hdr != NULL; meta_hdr = multilist_sublist_prev(meta_mls, meta_hdr)) { if (meta_hdr->b_spa != 0) break; } if (data_hdr == NULL && meta_hdr == NULL) { type = ARC_BUFC_DATA; } else if (data_hdr == NULL) { ASSERT3P(meta_hdr, !=, NULL); type = ARC_BUFC_METADATA; } else if (meta_hdr == NULL) { ASSERT3P(data_hdr, !=, NULL); type = ARC_BUFC_DATA; } else { ASSERT3P(data_hdr, !=, NULL); ASSERT3P(meta_hdr, !=, NULL); /* The headers can't be on the sublist without an L1 header */ ASSERT(HDR_HAS_L1HDR(data_hdr)); ASSERT(HDR_HAS_L1HDR(meta_hdr)); if (data_hdr->b_l1hdr.b_arc_access < meta_hdr->b_l1hdr.b_arc_access) { type = ARC_BUFC_DATA; } else { type = ARC_BUFC_METADATA; } } multilist_sublist_unlock(meta_mls); multilist_sublist_unlock(data_mls); return (type); } /* * Evict buffers from the cache, such that arcstat_size is capped by arc_c. */ static uint64_t arc_evict(void) { uint64_t total_evicted = 0; uint64_t bytes; int64_t target; uint64_t asize = aggsum_value(&arc_sums.arcstat_size); uint64_t ameta = aggsum_value(&arc_sums.arcstat_meta_used); /* * If we're over arc_meta_limit, we want to correct that before * potentially evicting data buffers below. */ total_evicted += arc_evict_meta(ameta); /* * Adjust MRU size * * If we're over the target cache size, we want to evict enough * from the list to get back to our target size. We don't want * to evict too much from the MRU, such that it drops below * arc_p. So, if we're over our target cache size more than * the MRU is over arc_p, we'll evict enough to get back to * arc_p here, and then evict more from the MFU below. */ target = MIN((int64_t)(asize - arc_c), (int64_t)(zfs_refcount_count(&arc_anon->arcs_size) + zfs_refcount_count(&arc_mru->arcs_size) + ameta - arc_p)); /* * If we're below arc_meta_min, always prefer to evict data. * Otherwise, try to satisfy the requested number of bytes to * evict from the type which contains older buffers; in an * effort to keep newer buffers in the cache regardless of their * type. If we cannot satisfy the number of bytes from this * type, spill over into the next type. */ if (arc_evict_type(arc_mru) == ARC_BUFC_METADATA && ameta > arc_meta_min) { bytes = arc_evict_impl(arc_mru, 0, target, ARC_BUFC_METADATA); total_evicted += bytes; /* * If we couldn't evict our target number of bytes from * metadata, we try to get the rest from data. */ target -= bytes; total_evicted += arc_evict_impl(arc_mru, 0, target, ARC_BUFC_DATA); } else { bytes = arc_evict_impl(arc_mru, 0, target, ARC_BUFC_DATA); total_evicted += bytes; /* * If we couldn't evict our target number of bytes from * data, we try to get the rest from metadata. */ target -= bytes; total_evicted += arc_evict_impl(arc_mru, 0, target, ARC_BUFC_METADATA); } /* * Re-sum ARC stats after the first round of evictions. */ asize = aggsum_value(&arc_sums.arcstat_size); ameta = aggsum_value(&arc_sums.arcstat_meta_used); /* * Adjust MFU size * * Now that we've tried to evict enough from the MRU to get its * size back to arc_p, if we're still above the target cache * size, we evict the rest from the MFU. */ target = asize - arc_c; if (arc_evict_type(arc_mfu) == ARC_BUFC_METADATA && ameta > arc_meta_min) { bytes = arc_evict_impl(arc_mfu, 0, target, ARC_BUFC_METADATA); total_evicted += bytes; /* * If we couldn't evict our target number of bytes from * metadata, we try to get the rest from data. */ target -= bytes; total_evicted += arc_evict_impl(arc_mfu, 0, target, ARC_BUFC_DATA); } else { bytes = arc_evict_impl(arc_mfu, 0, target, ARC_BUFC_DATA); total_evicted += bytes; /* * If we couldn't evict our target number of bytes from * data, we try to get the rest from data. */ target -= bytes; total_evicted += arc_evict_impl(arc_mfu, 0, target, ARC_BUFC_METADATA); } /* * Adjust ghost lists * * In addition to the above, the ARC also defines target values * for the ghost lists. The sum of the mru list and mru ghost * list should never exceed the target size of the cache, and * the sum of the mru list, mfu list, mru ghost list, and mfu * ghost list should never exceed twice the target size of the * cache. The following logic enforces these limits on the ghost * caches, and evicts from them as needed. */ target = zfs_refcount_count(&arc_mru->arcs_size) + zfs_refcount_count(&arc_mru_ghost->arcs_size) - arc_c; bytes = arc_evict_impl(arc_mru_ghost, 0, target, ARC_BUFC_DATA); total_evicted += bytes; target -= bytes; total_evicted += arc_evict_impl(arc_mru_ghost, 0, target, ARC_BUFC_METADATA); /* * We assume the sum of the mru list and mfu list is less than * or equal to arc_c (we enforced this above), which means we * can use the simpler of the two equations below: * * mru + mfu + mru ghost + mfu ghost <= 2 * arc_c * mru ghost + mfu ghost <= arc_c */ target = zfs_refcount_count(&arc_mru_ghost->arcs_size) + zfs_refcount_count(&arc_mfu_ghost->arcs_size) - arc_c; bytes = arc_evict_impl(arc_mfu_ghost, 0, target, ARC_BUFC_DATA); total_evicted += bytes; target -= bytes; total_evicted += arc_evict_impl(arc_mfu_ghost, 0, target, ARC_BUFC_METADATA); return (total_evicted); } void arc_flush(spa_t *spa, boolean_t retry) { uint64_t guid = 0; /* * If retry is B_TRUE, a spa must not be specified since we have * no good way to determine if all of a spa's buffers have been * evicted from an arc state. */ ASSERT(!retry || spa == 0); if (spa != NULL) guid = spa_load_guid(spa); (void) arc_flush_state(arc_mru, guid, ARC_BUFC_DATA, retry); (void) arc_flush_state(arc_mru, guid, ARC_BUFC_METADATA, retry); (void) arc_flush_state(arc_mfu, guid, ARC_BUFC_DATA, retry); (void) arc_flush_state(arc_mfu, guid, ARC_BUFC_METADATA, retry); (void) arc_flush_state(arc_mru_ghost, guid, ARC_BUFC_DATA, retry); (void) arc_flush_state(arc_mru_ghost, guid, ARC_BUFC_METADATA, retry); (void) arc_flush_state(arc_mfu_ghost, guid, ARC_BUFC_DATA, retry); (void) arc_flush_state(arc_mfu_ghost, guid, ARC_BUFC_METADATA, retry); } void arc_reduce_target_size(int64_t to_free) { uint64_t asize = aggsum_value(&arc_sums.arcstat_size); /* * All callers want the ARC to actually evict (at least) this much * memory. Therefore we reduce from the lower of the current size and * the target size. This way, even if arc_c is much higher than * arc_size (as can be the case after many calls to arc_freed(), we will * immediately have arc_c < arc_size and therefore the arc_evict_zthr * will evict. */ uint64_t c = MIN(arc_c, asize); if (c > to_free && c - to_free > arc_c_min) { arc_c = c - to_free; atomic_add_64(&arc_p, -(arc_p >> arc_shrink_shift)); if (arc_p > arc_c) arc_p = (arc_c >> 1); ASSERT(arc_c >= arc_c_min); ASSERT((int64_t)arc_p >= 0); } else { arc_c = arc_c_min; } if (asize > arc_c) { /* See comment in arc_evict_cb_check() on why lock+flag */ mutex_enter(&arc_evict_lock); arc_evict_needed = B_TRUE; mutex_exit(&arc_evict_lock); zthr_wakeup(arc_evict_zthr); } } /* * Determine if the system is under memory pressure and is asking * to reclaim memory. A return value of B_TRUE indicates that the system * is under memory pressure and that the arc should adjust accordingly. */ boolean_t arc_reclaim_needed(void) { return (arc_available_memory() < 0); } void arc_kmem_reap_soon(void) { size_t i; kmem_cache_t *prev_cache = NULL; kmem_cache_t *prev_data_cache = NULL; #ifdef _KERNEL if ((aggsum_compare(&arc_sums.arcstat_meta_used, arc_meta_limit) >= 0) && zfs_arc_meta_prune) { /* * We are exceeding our meta-data cache limit. * Prune some entries to release holds on meta-data. */ arc_prune_async(zfs_arc_meta_prune); } #if defined(_ILP32) /* * Reclaim unused memory from all kmem caches. */ kmem_reap(); #endif #endif for (i = 0; i < SPA_MAXBLOCKSIZE >> SPA_MINBLOCKSHIFT; i++) { #if defined(_ILP32) /* reach upper limit of cache size on 32-bit */ if (zio_buf_cache[i] == NULL) break; #endif if (zio_buf_cache[i] != prev_cache) { prev_cache = zio_buf_cache[i]; kmem_cache_reap_now(zio_buf_cache[i]); } if (zio_data_buf_cache[i] != prev_data_cache) { prev_data_cache = zio_data_buf_cache[i]; kmem_cache_reap_now(zio_data_buf_cache[i]); } } kmem_cache_reap_now(buf_cache); kmem_cache_reap_now(hdr_full_cache); kmem_cache_reap_now(hdr_l2only_cache); kmem_cache_reap_now(zfs_btree_leaf_cache); abd_cache_reap_now(); } static boolean_t arc_evict_cb_check(void *arg, zthr_t *zthr) { (void) arg, (void) zthr; #ifdef ZFS_DEBUG /* * This is necessary in order to keep the kstat information * up to date for tools that display kstat data such as the * mdb ::arc dcmd and the Linux crash utility. These tools * typically do not call kstat's update function, but simply * dump out stats from the most recent update. Without * this call, these commands may show stale stats for the * anon, mru, mru_ghost, mfu, and mfu_ghost lists. Even * with this call, the data might be out of date if the * evict thread hasn't been woken recently; but that should * suffice. The arc_state_t structures can be queried * directly if more accurate information is needed. */ if (arc_ksp != NULL) arc_ksp->ks_update(arc_ksp, KSTAT_READ); #endif /* * We have to rely on arc_wait_for_eviction() to tell us when to * evict, rather than checking if we are overflowing here, so that we * are sure to not leave arc_wait_for_eviction() waiting on aew_cv. * If we have become "not overflowing" since arc_wait_for_eviction() * checked, we need to wake it up. We could broadcast the CV here, * but arc_wait_for_eviction() may have not yet gone to sleep. We * would need to use a mutex to ensure that this function doesn't * broadcast until arc_wait_for_eviction() has gone to sleep (e.g. * the arc_evict_lock). However, the lock ordering of such a lock * would necessarily be incorrect with respect to the zthr_lock, * which is held before this function is called, and is held by * arc_wait_for_eviction() when it calls zthr_wakeup(). */ return (arc_evict_needed); } /* * Keep arc_size under arc_c by running arc_evict which evicts data * from the ARC. */ static void arc_evict_cb(void *arg, zthr_t *zthr) { (void) arg, (void) zthr; uint64_t evicted = 0; fstrans_cookie_t cookie = spl_fstrans_mark(); /* Evict from cache */ evicted = arc_evict(); /* * If evicted is zero, we couldn't evict anything * via arc_evict(). This could be due to hash lock * collisions, but more likely due to the majority of * arc buffers being unevictable. Therefore, even if * arc_size is above arc_c, another pass is unlikely to * be helpful and could potentially cause us to enter an * infinite loop. Additionally, zthr_iscancelled() is * checked here so that if the arc is shutting down, the * broadcast will wake any remaining arc evict waiters. */ mutex_enter(&arc_evict_lock); arc_evict_needed = !zthr_iscancelled(arc_evict_zthr) && evicted > 0 && aggsum_compare(&arc_sums.arcstat_size, arc_c) > 0; if (!arc_evict_needed) { /* * We're either no longer overflowing, or we * can't evict anything more, so we should wake * arc_get_data_impl() sooner. */ arc_evict_waiter_t *aw; while ((aw = list_remove_head(&arc_evict_waiters)) != NULL) { cv_broadcast(&aw->aew_cv); } arc_set_need_free(); } mutex_exit(&arc_evict_lock); spl_fstrans_unmark(cookie); } static boolean_t arc_reap_cb_check(void *arg, zthr_t *zthr) { (void) arg, (void) zthr; int64_t free_memory = arc_available_memory(); static int reap_cb_check_counter = 0; /* * If a kmem reap is already active, don't schedule more. We must * check for this because kmem_cache_reap_soon() won't actually * block on the cache being reaped (this is to prevent callers from * becoming implicitly blocked by a system-wide kmem reap -- which, * on a system with many, many full magazines, can take minutes). */ if (!kmem_cache_reap_active() && free_memory < 0) { arc_no_grow = B_TRUE; arc_warm = B_TRUE; /* * Wait at least zfs_grow_retry (default 5) seconds * before considering growing. */ arc_growtime = gethrtime() + SEC2NSEC(arc_grow_retry); return (B_TRUE); } else if (free_memory < arc_c >> arc_no_grow_shift) { arc_no_grow = B_TRUE; } else if (gethrtime() >= arc_growtime) { arc_no_grow = B_FALSE; } /* * Called unconditionally every 60 seconds to reclaim unused * zstd compression and decompression context. This is done * here to avoid the need for an independent thread. */ if (!((reap_cb_check_counter++) % 60)) zfs_zstd_cache_reap_now(); return (B_FALSE); } /* * Keep enough free memory in the system by reaping the ARC's kmem * caches. To cause more slabs to be reapable, we may reduce the * target size of the cache (arc_c), causing the arc_evict_cb() * to free more buffers. */ static void arc_reap_cb(void *arg, zthr_t *zthr) { (void) arg, (void) zthr; int64_t free_memory; fstrans_cookie_t cookie = spl_fstrans_mark(); /* * Kick off asynchronous kmem_reap()'s of all our caches. */ arc_kmem_reap_soon(); /* * Wait at least arc_kmem_cache_reap_retry_ms between * arc_kmem_reap_soon() calls. Without this check it is possible to * end up in a situation where we spend lots of time reaping * caches, while we're near arc_c_min. Waiting here also gives the * subsequent free memory check a chance of finding that the * asynchronous reap has already freed enough memory, and we don't * need to call arc_reduce_target_size(). */ delay((hz * arc_kmem_cache_reap_retry_ms + 999) / 1000); /* * Reduce the target size as needed to maintain the amount of free * memory in the system at a fraction of the arc_size (1/128th by * default). If oversubscribed (free_memory < 0) then reduce the * target arc_size by the deficit amount plus the fractional * amount. If free memory is positive but less than the fractional * amount, reduce by what is needed to hit the fractional amount. */ free_memory = arc_available_memory(); int64_t can_free = arc_c - arc_c_min; if (can_free > 0) { int64_t to_free = (can_free >> arc_shrink_shift) - free_memory; if (to_free > 0) arc_reduce_target_size(to_free); } spl_fstrans_unmark(cookie); } #ifdef _KERNEL /* * Determine the amount of memory eligible for eviction contained in the * ARC. All clean data reported by the ghost lists can always be safely * evicted. Due to arc_c_min, the same does not hold for all clean data * contained by the regular mru and mfu lists. * * In the case of the regular mru and mfu lists, we need to report as * much clean data as possible, such that evicting that same reported * data will not bring arc_size below arc_c_min. Thus, in certain * circumstances, the total amount of clean data in the mru and mfu * lists might not actually be evictable. * * The following two distinct cases are accounted for: * * 1. The sum of the amount of dirty data contained by both the mru and * mfu lists, plus the ARC's other accounting (e.g. the anon list), * is greater than or equal to arc_c_min. * (i.e. amount of dirty data >= arc_c_min) * * This is the easy case; all clean data contained by the mru and mfu * lists is evictable. Evicting all clean data can only drop arc_size * to the amount of dirty data, which is greater than arc_c_min. * * 2. The sum of the amount of dirty data contained by both the mru and * mfu lists, plus the ARC's other accounting (e.g. the anon list), * is less than arc_c_min. * (i.e. arc_c_min > amount of dirty data) * * 2.1. arc_size is greater than or equal arc_c_min. * (i.e. arc_size >= arc_c_min > amount of dirty data) * * In this case, not all clean data from the regular mru and mfu * lists is actually evictable; we must leave enough clean data * to keep arc_size above arc_c_min. Thus, the maximum amount of * evictable data from the two lists combined, is exactly the * difference between arc_size and arc_c_min. * * 2.2. arc_size is less than arc_c_min * (i.e. arc_c_min > arc_size > amount of dirty data) * * In this case, none of the data contained in the mru and mfu * lists is evictable, even if it's clean. Since arc_size is * already below arc_c_min, evicting any more would only * increase this negative difference. */ #endif /* _KERNEL */ /* * Adapt arc info given the number of bytes we are trying to add and * the state that we are coming from. This function is only called * when we are adding new content to the cache. */ static void arc_adapt(int bytes, arc_state_t *state) { int mult; uint64_t arc_p_min = (arc_c >> arc_p_min_shift); int64_t mrug_size = zfs_refcount_count(&arc_mru_ghost->arcs_size); int64_t mfug_size = zfs_refcount_count(&arc_mfu_ghost->arcs_size); ASSERT(bytes > 0); /* * Adapt the target size of the MRU list: * - if we just hit in the MRU ghost list, then increase * the target size of the MRU list. * - if we just hit in the MFU ghost list, then increase * the target size of the MFU list by decreasing the * target size of the MRU list. */ if (state == arc_mru_ghost) { mult = (mrug_size >= mfug_size) ? 1 : (mfug_size / mrug_size); if (!zfs_arc_p_dampener_disable) mult = MIN(mult, 10); /* avoid wild arc_p adjustment */ arc_p = MIN(arc_c - arc_p_min, arc_p + bytes * mult); } else if (state == arc_mfu_ghost) { uint64_t delta; mult = (mfug_size >= mrug_size) ? 1 : (mrug_size / mfug_size); if (!zfs_arc_p_dampener_disable) mult = MIN(mult, 10); delta = MIN(bytes * mult, arc_p); arc_p = MAX(arc_p_min, arc_p - delta); } ASSERT((int64_t)arc_p >= 0); /* * Wake reap thread if we do not have any available memory */ if (arc_reclaim_needed()) { zthr_wakeup(arc_reap_zthr); return; } if (arc_no_grow) return; if (arc_c >= arc_c_max) return; /* * If we're within (2 * maxblocksize) bytes of the target * cache size, increment the target cache size */ ASSERT3U(arc_c, >=, 2ULL << SPA_MAXBLOCKSHIFT); if (aggsum_upper_bound(&arc_sums.arcstat_size) >= arc_c - (2ULL << SPA_MAXBLOCKSHIFT)) { atomic_add_64(&arc_c, (int64_t)bytes); if (arc_c > arc_c_max) arc_c = arc_c_max; else if (state == arc_anon) atomic_add_64(&arc_p, (int64_t)bytes); if (arc_p > arc_c) arc_p = arc_c; } ASSERT((int64_t)arc_p >= 0); } /* * Check if arc_size has grown past our upper threshold, determined by * zfs_arc_overflow_shift. */ static arc_ovf_level_t arc_is_overflowing(boolean_t use_reserve) { /* Always allow at least one block of overflow */ int64_t overflow = MAX(SPA_MAXBLOCKSIZE, arc_c >> zfs_arc_overflow_shift); /* * We just compare the lower bound here for performance reasons. Our * primary goals are to make sure that the arc never grows without * bound, and that it can reach its maximum size. This check * accomplishes both goals. The maximum amount we could run over by is * 2 * aggsum_borrow_multiplier * NUM_CPUS * the average size of a block * in the ARC. In practice, that's in the tens of MB, which is low * enough to be safe. */ int64_t over = aggsum_lower_bound(&arc_sums.arcstat_size) - arc_c - overflow / 2; if (!use_reserve) overflow /= 2; return (over < 0 ? ARC_OVF_NONE : over < overflow ? ARC_OVF_SOME : ARC_OVF_SEVERE); } static abd_t * arc_get_data_abd(arc_buf_hdr_t *hdr, uint64_t size, const void *tag, int alloc_flags) { arc_buf_contents_t type = arc_buf_type(hdr); arc_get_data_impl(hdr, size, tag, alloc_flags); if (type == ARC_BUFC_METADATA) { return (abd_alloc(size, B_TRUE)); } else { ASSERT(type == ARC_BUFC_DATA); return (abd_alloc(size, B_FALSE)); } } static void * arc_get_data_buf(arc_buf_hdr_t *hdr, uint64_t size, const void *tag) { arc_buf_contents_t type = arc_buf_type(hdr); arc_get_data_impl(hdr, size, tag, ARC_HDR_DO_ADAPT); if (type == ARC_BUFC_METADATA) { return (zio_buf_alloc(size)); } else { ASSERT(type == ARC_BUFC_DATA); return (zio_data_buf_alloc(size)); } } /* * Wait for the specified amount of data (in bytes) to be evicted from the * ARC, and for there to be sufficient free memory in the system. Waiting for * eviction ensures that the memory used by the ARC decreases. Waiting for * free memory ensures that the system won't run out of free pages, regardless * of ARC behavior and settings. See arc_lowmem_init(). */ void arc_wait_for_eviction(uint64_t amount, boolean_t use_reserve) { switch (arc_is_overflowing(use_reserve)) { case ARC_OVF_NONE: return; case ARC_OVF_SOME: /* * This is a bit racy without taking arc_evict_lock, but the * worst that can happen is we either call zthr_wakeup() extra * time due to race with other thread here, or the set flag * get cleared by arc_evict_cb(), which is unlikely due to * big hysteresis, but also not important since at this level * of overflow the eviction is purely advisory. Same time * taking the global lock here every time without waiting for * the actual eviction creates a significant lock contention. */ if (!arc_evict_needed) { arc_evict_needed = B_TRUE; zthr_wakeup(arc_evict_zthr); } return; case ARC_OVF_SEVERE: default: { arc_evict_waiter_t aw; list_link_init(&aw.aew_node); cv_init(&aw.aew_cv, NULL, CV_DEFAULT, NULL); uint64_t last_count = 0; mutex_enter(&arc_evict_lock); if (!list_is_empty(&arc_evict_waiters)) { arc_evict_waiter_t *last = list_tail(&arc_evict_waiters); last_count = last->aew_count; } else if (!arc_evict_needed) { arc_evict_needed = B_TRUE; zthr_wakeup(arc_evict_zthr); } /* * Note, the last waiter's count may be less than * arc_evict_count if we are low on memory in which * case arc_evict_state_impl() may have deferred * wakeups (but still incremented arc_evict_count). */ aw.aew_count = MAX(last_count, arc_evict_count) + amount; list_insert_tail(&arc_evict_waiters, &aw); arc_set_need_free(); DTRACE_PROBE3(arc__wait__for__eviction, uint64_t, amount, uint64_t, arc_evict_count, uint64_t, aw.aew_count); /* * We will be woken up either when arc_evict_count reaches * aew_count, or when the ARC is no longer overflowing and * eviction completes. * In case of "false" wakeup, we will still be on the list. */ do { cv_wait(&aw.aew_cv, &arc_evict_lock); } while (list_link_active(&aw.aew_node)); mutex_exit(&arc_evict_lock); cv_destroy(&aw.aew_cv); } } } /* * Allocate a block and return it to the caller. If we are hitting the * hard limit for the cache size, we must sleep, waiting for the eviction * thread to catch up. If we're past the target size but below the hard * limit, we'll only signal the reclaim thread and continue on. */ static void arc_get_data_impl(arc_buf_hdr_t *hdr, uint64_t size, const void *tag, int alloc_flags) { arc_state_t *state = hdr->b_l1hdr.b_state; arc_buf_contents_t type = arc_buf_type(hdr); if (alloc_flags & ARC_HDR_DO_ADAPT) arc_adapt(size, state); /* * If arc_size is currently overflowing, we must be adding data * faster than we are evicting. To ensure we don't compound the * problem by adding more data and forcing arc_size to grow even * further past it's target size, we wait for the eviction thread to * make some progress. We also wait for there to be sufficient free * memory in the system, as measured by arc_free_memory(). * * Specifically, we wait for zfs_arc_eviction_pct percent of the * requested size to be evicted. This should be more than 100%, to * ensure that that progress is also made towards getting arc_size * under arc_c. See the comment above zfs_arc_eviction_pct. */ arc_wait_for_eviction(size * zfs_arc_eviction_pct / 100, alloc_flags & ARC_HDR_USE_RESERVE); VERIFY3U(hdr->b_type, ==, type); if (type == ARC_BUFC_METADATA) { arc_space_consume(size, ARC_SPACE_META); } else { arc_space_consume(size, ARC_SPACE_DATA); } /* * Update the state size. Note that ghost states have a * "ghost size" and so don't need to be updated. */ if (!GHOST_STATE(state)) { (void) zfs_refcount_add_many(&state->arcs_size, size, tag); /* * If this is reached via arc_read, the link is * protected by the hash lock. If reached via * arc_buf_alloc, the header should not be accessed by * any other thread. And, if reached via arc_read_done, * the hash lock will protect it if it's found in the * hash table; otherwise no other thread should be * trying to [add|remove]_reference it. */ if (multilist_link_active(&hdr->b_l1hdr.b_arc_node)) { ASSERT(zfs_refcount_is_zero(&hdr->b_l1hdr.b_refcnt)); (void) zfs_refcount_add_many(&state->arcs_esize[type], size, tag); } /* * If we are growing the cache, and we are adding anonymous * data, and we have outgrown arc_p, update arc_p */ if (aggsum_upper_bound(&arc_sums.arcstat_size) < arc_c && hdr->b_l1hdr.b_state == arc_anon && (zfs_refcount_count(&arc_anon->arcs_size) + zfs_refcount_count(&arc_mru->arcs_size) > arc_p)) arc_p = MIN(arc_c, arc_p + size); } } static void arc_free_data_abd(arc_buf_hdr_t *hdr, abd_t *abd, uint64_t size, const void *tag) { arc_free_data_impl(hdr, size, tag); abd_free(abd); } static void arc_free_data_buf(arc_buf_hdr_t *hdr, void *buf, uint64_t size, const void *tag) { arc_buf_contents_t type = arc_buf_type(hdr); arc_free_data_impl(hdr, size, tag); if (type == ARC_BUFC_METADATA) { zio_buf_free(buf, size); } else { ASSERT(type == ARC_BUFC_DATA); zio_data_buf_free(buf, size); } } /* * Free the arc data buffer. */ static void arc_free_data_impl(arc_buf_hdr_t *hdr, uint64_t size, const void *tag) { arc_state_t *state = hdr->b_l1hdr.b_state; arc_buf_contents_t type = arc_buf_type(hdr); /* protected by hash lock, if in the hash table */ if (multilist_link_active(&hdr->b_l1hdr.b_arc_node)) { ASSERT(zfs_refcount_is_zero(&hdr->b_l1hdr.b_refcnt)); ASSERT(state != arc_anon && state != arc_l2c_only); (void) zfs_refcount_remove_many(&state->arcs_esize[type], size, tag); } (void) zfs_refcount_remove_many(&state->arcs_size, size, tag); VERIFY3U(hdr->b_type, ==, type); if (type == ARC_BUFC_METADATA) { arc_space_return(size, ARC_SPACE_META); } else { ASSERT(type == ARC_BUFC_DATA); arc_space_return(size, ARC_SPACE_DATA); } } /* * This routine is called whenever a buffer is accessed. * NOTE: the hash lock is dropped in this function. */ static void arc_access(arc_buf_hdr_t *hdr, kmutex_t *hash_lock) { clock_t now; ASSERT(MUTEX_HELD(hash_lock)); ASSERT(HDR_HAS_L1HDR(hdr)); if (hdr->b_l1hdr.b_state == arc_anon) { /* * This buffer is not in the cache, and does not * appear in our "ghost" list. Add the new buffer * to the MRU state. */ ASSERT0(hdr->b_l1hdr.b_arc_access); hdr->b_l1hdr.b_arc_access = ddi_get_lbolt(); DTRACE_PROBE1(new_state__mru, arc_buf_hdr_t *, hdr); arc_change_state(arc_mru, hdr, hash_lock); } else if (hdr->b_l1hdr.b_state == arc_mru) { now = ddi_get_lbolt(); /* * If this buffer is here because of a prefetch, then either: * - clear the flag if this is a "referencing" read * (any subsequent access will bump this into the MFU state). * or * - move the buffer to the head of the list if this is * another prefetch (to make it less likely to be evicted). */ if (HDR_PREFETCH(hdr) || HDR_PRESCIENT_PREFETCH(hdr)) { if (zfs_refcount_count(&hdr->b_l1hdr.b_refcnt) == 0) { /* link protected by hash lock */ ASSERT(multilist_link_active( &hdr->b_l1hdr.b_arc_node)); } else { if (HDR_HAS_L2HDR(hdr)) l2arc_hdr_arcstats_decrement_state(hdr); arc_hdr_clear_flags(hdr, ARC_FLAG_PREFETCH | ARC_FLAG_PRESCIENT_PREFETCH); hdr->b_l1hdr.b_mru_hits++; ARCSTAT_BUMP(arcstat_mru_hits); if (HDR_HAS_L2HDR(hdr)) l2arc_hdr_arcstats_increment_state(hdr); } hdr->b_l1hdr.b_arc_access = now; return; } /* * This buffer has been "accessed" only once so far, * but it is still in the cache. Move it to the MFU * state. */ if (ddi_time_after(now, hdr->b_l1hdr.b_arc_access + ARC_MINTIME)) { /* * More than 125ms have passed since we * instantiated this buffer. Move it to the * most frequently used state. */ hdr->b_l1hdr.b_arc_access = now; DTRACE_PROBE1(new_state__mfu, arc_buf_hdr_t *, hdr); arc_change_state(arc_mfu, hdr, hash_lock); } hdr->b_l1hdr.b_mru_hits++; ARCSTAT_BUMP(arcstat_mru_hits); } else if (hdr->b_l1hdr.b_state == arc_mru_ghost) { arc_state_t *new_state; /* * This buffer has been "accessed" recently, but * was evicted from the cache. Move it to the * MFU state. */ if (HDR_PREFETCH(hdr) || HDR_PRESCIENT_PREFETCH(hdr)) { new_state = arc_mru; if (zfs_refcount_count(&hdr->b_l1hdr.b_refcnt) > 0) { if (HDR_HAS_L2HDR(hdr)) l2arc_hdr_arcstats_decrement_state(hdr); arc_hdr_clear_flags(hdr, ARC_FLAG_PREFETCH | ARC_FLAG_PRESCIENT_PREFETCH); if (HDR_HAS_L2HDR(hdr)) l2arc_hdr_arcstats_increment_state(hdr); } DTRACE_PROBE1(new_state__mru, arc_buf_hdr_t *, hdr); } else { new_state = arc_mfu; DTRACE_PROBE1(new_state__mfu, arc_buf_hdr_t *, hdr); } hdr->b_l1hdr.b_arc_access = ddi_get_lbolt(); arc_change_state(new_state, hdr, hash_lock); hdr->b_l1hdr.b_mru_ghost_hits++; ARCSTAT_BUMP(arcstat_mru_ghost_hits); } else if (hdr->b_l1hdr.b_state == arc_mfu) { /* * This buffer has been accessed more than once and is * still in the cache. Keep it in the MFU state. * * NOTE: an add_reference() that occurred when we did * the arc_read() will have kicked this off the list. * If it was a prefetch, we will explicitly move it to * the head of the list now. */ hdr->b_l1hdr.b_mfu_hits++; ARCSTAT_BUMP(arcstat_mfu_hits); hdr->b_l1hdr.b_arc_access = ddi_get_lbolt(); } else if (hdr->b_l1hdr.b_state == arc_mfu_ghost) { arc_state_t *new_state = arc_mfu; /* * This buffer has been accessed more than once but has * been evicted from the cache. Move it back to the * MFU state. */ if (HDR_PREFETCH(hdr) || HDR_PRESCIENT_PREFETCH(hdr)) { /* * This is a prefetch access... * move this block back to the MRU state. */ new_state = arc_mru; } hdr->b_l1hdr.b_arc_access = ddi_get_lbolt(); DTRACE_PROBE1(new_state__mfu, arc_buf_hdr_t *, hdr); arc_change_state(new_state, hdr, hash_lock); hdr->b_l1hdr.b_mfu_ghost_hits++; ARCSTAT_BUMP(arcstat_mfu_ghost_hits); } else if (hdr->b_l1hdr.b_state == arc_l2c_only) { /* * This buffer is on the 2nd Level ARC. */ hdr->b_l1hdr.b_arc_access = ddi_get_lbolt(); DTRACE_PROBE1(new_state__mfu, arc_buf_hdr_t *, hdr); arc_change_state(arc_mfu, hdr, hash_lock); } else { cmn_err(CE_PANIC, "invalid arc state 0x%p", hdr->b_l1hdr.b_state); } } /* * This routine is called by dbuf_hold() to update the arc_access() state * which otherwise would be skipped for entries in the dbuf cache. */ void arc_buf_access(arc_buf_t *buf) { mutex_enter(&buf->b_evict_lock); arc_buf_hdr_t *hdr = buf->b_hdr; /* * Avoid taking the hash_lock when possible as an optimization. * The header must be checked again under the hash_lock in order * to handle the case where it is concurrently being released. */ if (hdr->b_l1hdr.b_state == arc_anon || HDR_EMPTY(hdr)) { mutex_exit(&buf->b_evict_lock); return; } kmutex_t *hash_lock = HDR_LOCK(hdr); mutex_enter(hash_lock); if (hdr->b_l1hdr.b_state == arc_anon || HDR_EMPTY(hdr)) { mutex_exit(hash_lock); mutex_exit(&buf->b_evict_lock); ARCSTAT_BUMP(arcstat_access_skip); return; } mutex_exit(&buf->b_evict_lock); ASSERT(hdr->b_l1hdr.b_state == arc_mru || hdr->b_l1hdr.b_state == arc_mfu); DTRACE_PROBE1(arc__hit, arc_buf_hdr_t *, hdr); arc_access(hdr, hash_lock); mutex_exit(hash_lock); ARCSTAT_BUMP(arcstat_hits); ARCSTAT_CONDSTAT(!HDR_PREFETCH(hdr) && !HDR_PRESCIENT_PREFETCH(hdr), demand, prefetch, !HDR_ISTYPE_METADATA(hdr), data, metadata, hits); } /* a generic arc_read_done_func_t which you can use */ void arc_bcopy_func(zio_t *zio, const zbookmark_phys_t *zb, const blkptr_t *bp, arc_buf_t *buf, void *arg) { (void) zio, (void) zb, (void) bp; if (buf == NULL) return; memcpy(arg, buf->b_data, arc_buf_size(buf)); arc_buf_destroy(buf, arg); } /* a generic arc_read_done_func_t */ void arc_getbuf_func(zio_t *zio, const zbookmark_phys_t *zb, const blkptr_t *bp, arc_buf_t *buf, void *arg) { (void) zb, (void) bp; arc_buf_t **bufp = arg; if (buf == NULL) { ASSERT(zio == NULL || zio->io_error != 0); *bufp = NULL; } else { ASSERT(zio == NULL || zio->io_error == 0); *bufp = buf; ASSERT(buf->b_data != NULL); } } static void arc_hdr_verify(arc_buf_hdr_t *hdr, blkptr_t *bp) { if (BP_IS_HOLE(bp) || BP_IS_EMBEDDED(bp)) { ASSERT3U(HDR_GET_PSIZE(hdr), ==, 0); ASSERT3U(arc_hdr_get_compress(hdr), ==, ZIO_COMPRESS_OFF); } else { if (HDR_COMPRESSION_ENABLED(hdr)) { ASSERT3U(arc_hdr_get_compress(hdr), ==, BP_GET_COMPRESS(bp)); } ASSERT3U(HDR_GET_LSIZE(hdr), ==, BP_GET_LSIZE(bp)); ASSERT3U(HDR_GET_PSIZE(hdr), ==, BP_GET_PSIZE(bp)); ASSERT3U(!!HDR_PROTECTED(hdr), ==, BP_IS_PROTECTED(bp)); } } static void arc_read_done(zio_t *zio) { blkptr_t *bp = zio->io_bp; arc_buf_hdr_t *hdr = zio->io_private; kmutex_t *hash_lock = NULL; arc_callback_t *callback_list; arc_callback_t *acb; boolean_t freeable = B_FALSE; /* * The hdr was inserted into hash-table and removed from lists * prior to starting I/O. We should find this header, since * it's in the hash table, and it should be legit since it's * not possible to evict it during the I/O. The only possible * reason for it not to be found is if we were freed during the * read. */ if (HDR_IN_HASH_TABLE(hdr)) { arc_buf_hdr_t *found; ASSERT3U(hdr->b_birth, ==, BP_PHYSICAL_BIRTH(zio->io_bp)); ASSERT3U(hdr->b_dva.dva_word[0], ==, BP_IDENTITY(zio->io_bp)->dva_word[0]); ASSERT3U(hdr->b_dva.dva_word[1], ==, BP_IDENTITY(zio->io_bp)->dva_word[1]); found = buf_hash_find(hdr->b_spa, zio->io_bp, &hash_lock); ASSERT((found == hdr && DVA_EQUAL(&hdr->b_dva, BP_IDENTITY(zio->io_bp))) || (found == hdr && HDR_L2_READING(hdr))); ASSERT3P(hash_lock, !=, NULL); } if (BP_IS_PROTECTED(bp)) { hdr->b_crypt_hdr.b_ot = BP_GET_TYPE(bp); hdr->b_crypt_hdr.b_dsobj = zio->io_bookmark.zb_objset; zio_crypt_decode_params_bp(bp, hdr->b_crypt_hdr.b_salt, hdr->b_crypt_hdr.b_iv); if (zio->io_error == 0) { if (BP_GET_TYPE(bp) == DMU_OT_INTENT_LOG) { void *tmpbuf; tmpbuf = abd_borrow_buf_copy(zio->io_abd, sizeof (zil_chain_t)); zio_crypt_decode_mac_zil(tmpbuf, hdr->b_crypt_hdr.b_mac); abd_return_buf(zio->io_abd, tmpbuf, sizeof (zil_chain_t)); } else { zio_crypt_decode_mac_bp(bp, hdr->b_crypt_hdr.b_mac); } } } if (zio->io_error == 0) { /* byteswap if necessary */ if (BP_SHOULD_BYTESWAP(zio->io_bp)) { if (BP_GET_LEVEL(zio->io_bp) > 0) { hdr->b_l1hdr.b_byteswap = DMU_BSWAP_UINT64; } else { hdr->b_l1hdr.b_byteswap = DMU_OT_BYTESWAP(BP_GET_TYPE(zio->io_bp)); } } else { hdr->b_l1hdr.b_byteswap = DMU_BSWAP_NUMFUNCS; } if (!HDR_L2_READING(hdr)) { hdr->b_complevel = zio->io_prop.zp_complevel; } } arc_hdr_clear_flags(hdr, ARC_FLAG_L2_EVICTED); if (l2arc_noprefetch && HDR_PREFETCH(hdr)) arc_hdr_clear_flags(hdr, ARC_FLAG_L2CACHE); callback_list = hdr->b_l1hdr.b_acb; ASSERT3P(callback_list, !=, NULL); if (hash_lock && zio->io_error == 0 && hdr->b_l1hdr.b_state == arc_anon) { /* * Only call arc_access on anonymous buffers. This is because * if we've issued an I/O for an evicted buffer, we've already * called arc_access (to prevent any simultaneous readers from * getting confused). */ arc_access(hdr, hash_lock); } /* * If a read request has a callback (i.e. acb_done is not NULL), then we * make a buf containing the data according to the parameters which were * passed in. The implementation of arc_buf_alloc_impl() ensures that we * aren't needlessly decompressing the data multiple times. */ int callback_cnt = 0; for (acb = callback_list; acb != NULL; acb = acb->acb_next) { if (!acb->acb_done || acb->acb_nobuf) continue; callback_cnt++; if (zio->io_error != 0) continue; int error = arc_buf_alloc_impl(hdr, zio->io_spa, &acb->acb_zb, acb->acb_private, acb->acb_encrypted, acb->acb_compressed, acb->acb_noauth, B_TRUE, &acb->acb_buf); /* * Assert non-speculative zios didn't fail because an * encryption key wasn't loaded */ ASSERT((zio->io_flags & ZIO_FLAG_SPECULATIVE) || error != EACCES); /* * If we failed to decrypt, report an error now (as the zio * layer would have done if it had done the transforms). */ if (error == ECKSUM) { ASSERT(BP_IS_PROTECTED(bp)); error = SET_ERROR(EIO); if ((zio->io_flags & ZIO_FLAG_SPECULATIVE) == 0) { spa_log_error(zio->io_spa, &acb->acb_zb); (void) zfs_ereport_post( FM_EREPORT_ZFS_AUTHENTICATION, zio->io_spa, NULL, &acb->acb_zb, zio, 0); } } if (error != 0) { /* * Decompression or decryption failed. Set * io_error so that when we call acb_done * (below), we will indicate that the read * failed. Note that in the unusual case * where one callback is compressed and another * uncompressed, we will mark all of them * as failed, even though the uncompressed * one can't actually fail. In this case, * the hdr will not be anonymous, because * if there are multiple callbacks, it's * because multiple threads found the same * arc buf in the hash table. */ zio->io_error = error; } } /* * If there are multiple callbacks, we must have the hash lock, * because the only way for multiple threads to find this hdr is * in the hash table. This ensures that if there are multiple * callbacks, the hdr is not anonymous. If it were anonymous, * we couldn't use arc_buf_destroy() in the error case below. */ ASSERT(callback_cnt < 2 || hash_lock != NULL); hdr->b_l1hdr.b_acb = NULL; arc_hdr_clear_flags(hdr, ARC_FLAG_IO_IN_PROGRESS); if (callback_cnt == 0) ASSERT(hdr->b_l1hdr.b_pabd != NULL || HDR_HAS_RABD(hdr)); ASSERT(zfs_refcount_is_zero(&hdr->b_l1hdr.b_refcnt) || callback_list != NULL); if (zio->io_error == 0) { arc_hdr_verify(hdr, zio->io_bp); } else { arc_hdr_set_flags(hdr, ARC_FLAG_IO_ERROR); if (hdr->b_l1hdr.b_state != arc_anon) arc_change_state(arc_anon, hdr, hash_lock); if (HDR_IN_HASH_TABLE(hdr)) buf_hash_remove(hdr); freeable = zfs_refcount_is_zero(&hdr->b_l1hdr.b_refcnt); } /* * Broadcast before we drop the hash_lock to avoid the possibility * that the hdr (and hence the cv) might be freed before we get to * the cv_broadcast(). */ cv_broadcast(&hdr->b_l1hdr.b_cv); if (hash_lock != NULL) { mutex_exit(hash_lock); } else { /* * This block was freed while we waited for the read to * complete. It has been removed from the hash table and * moved to the anonymous state (so that it won't show up * in the cache). */ ASSERT3P(hdr->b_l1hdr.b_state, ==, arc_anon); freeable = zfs_refcount_is_zero(&hdr->b_l1hdr.b_refcnt); } /* execute each callback and free its structure */ while ((acb = callback_list) != NULL) { if (acb->acb_done != NULL) { if (zio->io_error != 0 && acb->acb_buf != NULL) { /* * If arc_buf_alloc_impl() fails during * decompression, the buf will still be * allocated, and needs to be freed here. */ arc_buf_destroy(acb->acb_buf, acb->acb_private); acb->acb_buf = NULL; } acb->acb_done(zio, &zio->io_bookmark, zio->io_bp, acb->acb_buf, acb->acb_private); } if (acb->acb_zio_dummy != NULL) { acb->acb_zio_dummy->io_error = zio->io_error; zio_nowait(acb->acb_zio_dummy); } callback_list = acb->acb_next; kmem_free(acb, sizeof (arc_callback_t)); } if (freeable) arc_hdr_destroy(hdr); } /* * "Read" the block at the specified DVA (in bp) via the * cache. If the block is found in the cache, invoke the provided * callback immediately and return. Note that the `zio' parameter * in the callback will be NULL in this case, since no IO was * required. If the block is not in the cache pass the read request * on to the spa with a substitute callback function, so that the * requested block will be added to the cache. * * If a read request arrives for a block that has a read in-progress, * either wait for the in-progress read to complete (and return the * results); or, if this is a read with a "done" func, add a record * to the read to invoke the "done" func when the read completes, * and return; or just return. * * arc_read_done() will invoke all the requested "done" functions * for readers of this block. */ int arc_read(zio_t *pio, spa_t *spa, const blkptr_t *bp, arc_read_done_func_t *done, void *private, zio_priority_t priority, int zio_flags, arc_flags_t *arc_flags, const zbookmark_phys_t *zb) { arc_buf_hdr_t *hdr = NULL; kmutex_t *hash_lock = NULL; zio_t *rzio; uint64_t guid = spa_load_guid(spa); boolean_t compressed_read = (zio_flags & ZIO_FLAG_RAW_COMPRESS) != 0; boolean_t encrypted_read = BP_IS_ENCRYPTED(bp) && (zio_flags & ZIO_FLAG_RAW_ENCRYPT) != 0; boolean_t noauth_read = BP_IS_AUTHENTICATED(bp) && (zio_flags & ZIO_FLAG_RAW_ENCRYPT) != 0; boolean_t embedded_bp = !!BP_IS_EMBEDDED(bp); boolean_t no_buf = *arc_flags & ARC_FLAG_NO_BUF; int rc = 0; ASSERT(!embedded_bp || BPE_GET_ETYPE(bp) == BP_EMBEDDED_TYPE_DATA); ASSERT(!BP_IS_HOLE(bp)); ASSERT(!BP_IS_REDACTED(bp)); /* * Normally SPL_FSTRANS will already be set since kernel threads which * expect to call the DMU interfaces will set it when created. System * calls are similarly handled by setting/cleaning the bit in the * registered callback (module/os/.../zfs/zpl_*). * * External consumers such as Lustre which call the exported DMU * interfaces may not have set SPL_FSTRANS. To avoid a deadlock * on the hash_lock always set and clear the bit. */ fstrans_cookie_t cookie = spl_fstrans_mark(); top: /* * Verify the block pointer contents are reasonable. This should * always be the case since the blkptr is protected by a checksum. * However, if there is damage it's desirable to detect this early * and treat it as a checksum error. This allows an alternate blkptr * to be tried when one is available (e.g. ditto blocks). */ if (!zfs_blkptr_verify(spa, bp, zio_flags & ZIO_FLAG_CONFIG_WRITER, BLK_VERIFY_LOG)) { rc = SET_ERROR(ECKSUM); goto out; } if (!embedded_bp) { /* * Embedded BP's have no DVA and require no I/O to "read". * Create an anonymous arc buf to back it. */ hdr = buf_hash_find(guid, bp, &hash_lock); } /* * Determine if we have an L1 cache hit or a cache miss. For simplicity * we maintain encrypted data separately from compressed / uncompressed * data. If the user is requesting raw encrypted data and we don't have * that in the header we will read from disk to guarantee that we can * get it even if the encryption keys aren't loaded. */ if (hdr != NULL && HDR_HAS_L1HDR(hdr) && (HDR_HAS_RABD(hdr) || (hdr->b_l1hdr.b_pabd != NULL && !encrypted_read))) { arc_buf_t *buf = NULL; *arc_flags |= ARC_FLAG_CACHED; if (HDR_IO_IN_PROGRESS(hdr)) { zio_t *head_zio = hdr->b_l1hdr.b_acb->acb_zio_head; if (*arc_flags & ARC_FLAG_CACHED_ONLY) { mutex_exit(hash_lock); ARCSTAT_BUMP(arcstat_cached_only_in_progress); rc = SET_ERROR(ENOENT); goto out; } ASSERT3P(head_zio, !=, NULL); if ((hdr->b_flags & ARC_FLAG_PRIO_ASYNC_READ) && priority == ZIO_PRIORITY_SYNC_READ) { /* * This is a sync read that needs to wait for * an in-flight async read. Request that the * zio have its priority upgraded. */ zio_change_priority(head_zio, priority); DTRACE_PROBE1(arc__async__upgrade__sync, arc_buf_hdr_t *, hdr); ARCSTAT_BUMP(arcstat_async_upgrade_sync); } if (hdr->b_flags & ARC_FLAG_PREDICTIVE_PREFETCH) { arc_hdr_clear_flags(hdr, ARC_FLAG_PREDICTIVE_PREFETCH); } /* * If there are multiple threads reading the same block * and that block is not yet in the ARC, then only one * thread will do the physical I/O and all other * threads will wait until that I/O completes. * Synchronous reads use the b_cv whereas nowait reads * register a callback. Both are signalled/called in * arc_read_done. * * Errors of the physical I/O may need to be propagated * to the pio. For synchronous reads, we simply restart * this function and it will reassess. Nowait reads * attach the acb_zio_dummy zio to pio and * arc_read_done propagates the physical I/O's io_error * to acb_zio_dummy, and thereby to pio. */ if (*arc_flags & ARC_FLAG_WAIT) { cv_wait(&hdr->b_l1hdr.b_cv, hash_lock); mutex_exit(hash_lock); goto top; } ASSERT(*arc_flags & ARC_FLAG_NOWAIT); if (done) { arc_callback_t *acb = NULL; acb = kmem_zalloc(sizeof (arc_callback_t), KM_SLEEP); acb->acb_done = done; acb->acb_private = private; acb->acb_compressed = compressed_read; acb->acb_encrypted = encrypted_read; acb->acb_noauth = noauth_read; acb->acb_nobuf = no_buf; acb->acb_zb = *zb; if (pio != NULL) acb->acb_zio_dummy = zio_null(pio, spa, NULL, NULL, NULL, zio_flags); ASSERT3P(acb->acb_done, !=, NULL); acb->acb_zio_head = head_zio; acb->acb_next = hdr->b_l1hdr.b_acb; hdr->b_l1hdr.b_acb = acb; } mutex_exit(hash_lock); goto out; } ASSERT(hdr->b_l1hdr.b_state == arc_mru || hdr->b_l1hdr.b_state == arc_mfu); if (done && !no_buf) { if (hdr->b_flags & ARC_FLAG_PREDICTIVE_PREFETCH) { /* * This is a demand read which does not have to * wait for i/o because we did a predictive * prefetch i/o for it, which has completed. */ DTRACE_PROBE1( arc__demand__hit__predictive__prefetch, arc_buf_hdr_t *, hdr); ARCSTAT_BUMP( arcstat_demand_hit_predictive_prefetch); arc_hdr_clear_flags(hdr, ARC_FLAG_PREDICTIVE_PREFETCH); } if (hdr->b_flags & ARC_FLAG_PRESCIENT_PREFETCH) { ARCSTAT_BUMP( arcstat_demand_hit_prescient_prefetch); arc_hdr_clear_flags(hdr, ARC_FLAG_PRESCIENT_PREFETCH); } ASSERT(!embedded_bp || !BP_IS_HOLE(bp)); /* Get a buf with the desired data in it. */ rc = arc_buf_alloc_impl(hdr, spa, zb, private, encrypted_read, compressed_read, noauth_read, B_TRUE, &buf); if (rc == ECKSUM) { /* * Convert authentication and decryption errors * to EIO (and generate an ereport if needed) * before leaving the ARC. */ rc = SET_ERROR(EIO); if ((zio_flags & ZIO_FLAG_SPECULATIVE) == 0) { spa_log_error(spa, zb); (void) zfs_ereport_post( FM_EREPORT_ZFS_AUTHENTICATION, spa, NULL, zb, NULL, 0); } } if (rc != 0) { (void) remove_reference(hdr, hash_lock, private); arc_buf_destroy_impl(buf); buf = NULL; } /* assert any errors weren't due to unloaded keys */ ASSERT((zio_flags & ZIO_FLAG_SPECULATIVE) || rc != EACCES); } else if (*arc_flags & ARC_FLAG_PREFETCH && zfs_refcount_is_zero(&hdr->b_l1hdr.b_refcnt)) { if (HDR_HAS_L2HDR(hdr)) l2arc_hdr_arcstats_decrement_state(hdr); arc_hdr_set_flags(hdr, ARC_FLAG_PREFETCH); if (HDR_HAS_L2HDR(hdr)) l2arc_hdr_arcstats_increment_state(hdr); } DTRACE_PROBE1(arc__hit, arc_buf_hdr_t *, hdr); arc_access(hdr, hash_lock); if (*arc_flags & ARC_FLAG_PRESCIENT_PREFETCH) arc_hdr_set_flags(hdr, ARC_FLAG_PRESCIENT_PREFETCH); if (*arc_flags & ARC_FLAG_L2CACHE) arc_hdr_set_flags(hdr, ARC_FLAG_L2CACHE); mutex_exit(hash_lock); ARCSTAT_BUMP(arcstat_hits); ARCSTAT_CONDSTAT(!HDR_PREFETCH(hdr), demand, prefetch, !HDR_ISTYPE_METADATA(hdr), data, metadata, hits); if (done) done(NULL, zb, bp, buf, private); } else { uint64_t lsize = BP_GET_LSIZE(bp); uint64_t psize = BP_GET_PSIZE(bp); arc_callback_t *acb; vdev_t *vd = NULL; uint64_t addr = 0; boolean_t devw = B_FALSE; uint64_t size; abd_t *hdr_abd; int alloc_flags = encrypted_read ? ARC_HDR_ALLOC_RDATA : 0; if (*arc_flags & ARC_FLAG_CACHED_ONLY) { rc = SET_ERROR(ENOENT); if (hash_lock != NULL) mutex_exit(hash_lock); goto out; } if (hdr == NULL) { /* * This block is not in the cache or it has * embedded data. */ arc_buf_hdr_t *exists = NULL; arc_buf_contents_t type = BP_GET_BUFC_TYPE(bp); hdr = arc_hdr_alloc(spa_load_guid(spa), psize, lsize, BP_IS_PROTECTED(bp), BP_GET_COMPRESS(bp), 0, type); if (!embedded_bp) { hdr->b_dva = *BP_IDENTITY(bp); hdr->b_birth = BP_PHYSICAL_BIRTH(bp); exists = buf_hash_insert(hdr, &hash_lock); } if (exists != NULL) { /* somebody beat us to the hash insert */ mutex_exit(hash_lock); buf_discard_identity(hdr); arc_hdr_destroy(hdr); goto top; /* restart the IO request */ } alloc_flags |= ARC_HDR_DO_ADAPT; } else { /* * This block is in the ghost cache or encrypted data * was requested and we didn't have it. If it was * L2-only (and thus didn't have an L1 hdr), * we realloc the header to add an L1 hdr. */ if (!HDR_HAS_L1HDR(hdr)) { hdr = arc_hdr_realloc(hdr, hdr_l2only_cache, hdr_full_cache); } if (GHOST_STATE(hdr->b_l1hdr.b_state)) { ASSERT3P(hdr->b_l1hdr.b_pabd, ==, NULL); ASSERT(!HDR_HAS_RABD(hdr)); ASSERT(!HDR_IO_IN_PROGRESS(hdr)); ASSERT0(zfs_refcount_count( &hdr->b_l1hdr.b_refcnt)); ASSERT3P(hdr->b_l1hdr.b_buf, ==, NULL); ASSERT3P(hdr->b_l1hdr.b_freeze_cksum, ==, NULL); } else if (HDR_IO_IN_PROGRESS(hdr)) { /* * If this header already had an IO in progress * and we are performing another IO to fetch * encrypted data we must wait until the first * IO completes so as not to confuse * arc_read_done(). This should be very rare * and so the performance impact shouldn't * matter. */ cv_wait(&hdr->b_l1hdr.b_cv, hash_lock); mutex_exit(hash_lock); goto top; } /* * This is a delicate dance that we play here. * This hdr might be in the ghost list so we access * it to move it out of the ghost list before we * initiate the read. If it's a prefetch then * it won't have a callback so we'll remove the * reference that arc_buf_alloc_impl() created. We * do this after we've called arc_access() to * avoid hitting an assert in remove_reference(). */ arc_adapt(arc_hdr_size(hdr), hdr->b_l1hdr.b_state); arc_access(hdr, hash_lock); } arc_hdr_alloc_abd(hdr, alloc_flags); if (encrypted_read) { ASSERT(HDR_HAS_RABD(hdr)); size = HDR_GET_PSIZE(hdr); hdr_abd = hdr->b_crypt_hdr.b_rabd; zio_flags |= ZIO_FLAG_RAW; } else { ASSERT3P(hdr->b_l1hdr.b_pabd, !=, NULL); size = arc_hdr_size(hdr); hdr_abd = hdr->b_l1hdr.b_pabd; if (arc_hdr_get_compress(hdr) != ZIO_COMPRESS_OFF) { zio_flags |= ZIO_FLAG_RAW_COMPRESS; } /* * For authenticated bp's, we do not ask the ZIO layer * to authenticate them since this will cause the entire * IO to fail if the key isn't loaded. Instead, we * defer authentication until arc_buf_fill(), which will * verify the data when the key is available. */ if (BP_IS_AUTHENTICATED(bp)) zio_flags |= ZIO_FLAG_RAW_ENCRYPT; } if (*arc_flags & ARC_FLAG_PREFETCH && zfs_refcount_is_zero(&hdr->b_l1hdr.b_refcnt)) { if (HDR_HAS_L2HDR(hdr)) l2arc_hdr_arcstats_decrement_state(hdr); arc_hdr_set_flags(hdr, ARC_FLAG_PREFETCH); if (HDR_HAS_L2HDR(hdr)) l2arc_hdr_arcstats_increment_state(hdr); } if (*arc_flags & ARC_FLAG_PRESCIENT_PREFETCH) arc_hdr_set_flags(hdr, ARC_FLAG_PRESCIENT_PREFETCH); if (*arc_flags & ARC_FLAG_L2CACHE) arc_hdr_set_flags(hdr, ARC_FLAG_L2CACHE); if (BP_IS_AUTHENTICATED(bp)) arc_hdr_set_flags(hdr, ARC_FLAG_NOAUTH); if (BP_GET_LEVEL(bp) > 0) arc_hdr_set_flags(hdr, ARC_FLAG_INDIRECT); if (*arc_flags & ARC_FLAG_PREDICTIVE_PREFETCH) arc_hdr_set_flags(hdr, ARC_FLAG_PREDICTIVE_PREFETCH); ASSERT(!GHOST_STATE(hdr->b_l1hdr.b_state)); acb = kmem_zalloc(sizeof (arc_callback_t), KM_SLEEP); acb->acb_done = done; acb->acb_private = private; acb->acb_compressed = compressed_read; acb->acb_encrypted = encrypted_read; acb->acb_noauth = noauth_read; acb->acb_zb = *zb; ASSERT3P(hdr->b_l1hdr.b_acb, ==, NULL); hdr->b_l1hdr.b_acb = acb; arc_hdr_set_flags(hdr, ARC_FLAG_IO_IN_PROGRESS); if (HDR_HAS_L2HDR(hdr) && (vd = hdr->b_l2hdr.b_dev->l2ad_vdev) != NULL) { devw = hdr->b_l2hdr.b_dev->l2ad_writing; addr = hdr->b_l2hdr.b_daddr; /* * Lock out L2ARC device removal. */ if (vdev_is_dead(vd) || !spa_config_tryenter(spa, SCL_L2ARC, vd, RW_READER)) vd = NULL; } /* * We count both async reads and scrub IOs as asynchronous so * that both can be upgraded in the event of a cache hit while * the read IO is still in-flight. */ if (priority == ZIO_PRIORITY_ASYNC_READ || priority == ZIO_PRIORITY_SCRUB) arc_hdr_set_flags(hdr, ARC_FLAG_PRIO_ASYNC_READ); else arc_hdr_clear_flags(hdr, ARC_FLAG_PRIO_ASYNC_READ); /* * At this point, we have a level 1 cache miss or a blkptr * with embedded data. Try again in L2ARC if possible. */ ASSERT3U(HDR_GET_LSIZE(hdr), ==, lsize); /* * Skip ARC stat bump for block pointers with embedded * data. The data are read from the blkptr itself via * decode_embedded_bp_compressed(). */ if (!embedded_bp) { DTRACE_PROBE4(arc__miss, arc_buf_hdr_t *, hdr, blkptr_t *, bp, uint64_t, lsize, zbookmark_phys_t *, zb); ARCSTAT_BUMP(arcstat_misses); ARCSTAT_CONDSTAT(!HDR_PREFETCH(hdr), demand, prefetch, !HDR_ISTYPE_METADATA(hdr), data, metadata, misses); zfs_racct_read(size, 1); } /* Check if the spa even has l2 configured */ const boolean_t spa_has_l2 = l2arc_ndev != 0 && spa->spa_l2cache.sav_count > 0; if (vd != NULL && spa_has_l2 && !(l2arc_norw && devw)) { /* * Read from the L2ARC if the following are true: * 1. The L2ARC vdev was previously cached. * 2. This buffer still has L2ARC metadata. * 3. This buffer isn't currently writing to the L2ARC. * 4. The L2ARC entry wasn't evicted, which may * also have invalidated the vdev. * 5. This isn't prefetch or l2arc_noprefetch is 0. */ if (HDR_HAS_L2HDR(hdr) && !HDR_L2_WRITING(hdr) && !HDR_L2_EVICTED(hdr) && !(l2arc_noprefetch && HDR_PREFETCH(hdr))) { l2arc_read_callback_t *cb; abd_t *abd; uint64_t asize; DTRACE_PROBE1(l2arc__hit, arc_buf_hdr_t *, hdr); ARCSTAT_BUMP(arcstat_l2_hits); hdr->b_l2hdr.b_hits++; cb = kmem_zalloc(sizeof (l2arc_read_callback_t), KM_SLEEP); cb->l2rcb_hdr = hdr; cb->l2rcb_bp = *bp; cb->l2rcb_zb = *zb; cb->l2rcb_flags = zio_flags; /* * When Compressed ARC is disabled, but the * L2ARC block is compressed, arc_hdr_size() * will have returned LSIZE rather than PSIZE. */ if (HDR_GET_COMPRESS(hdr) != ZIO_COMPRESS_OFF && !HDR_COMPRESSION_ENABLED(hdr) && HDR_GET_PSIZE(hdr) != 0) { size = HDR_GET_PSIZE(hdr); } asize = vdev_psize_to_asize(vd, size); if (asize != size) { abd = abd_alloc_for_io(asize, HDR_ISTYPE_METADATA(hdr)); cb->l2rcb_abd = abd; } else { abd = hdr_abd; } ASSERT(addr >= VDEV_LABEL_START_SIZE && addr + asize <= vd->vdev_psize - VDEV_LABEL_END_SIZE); /* * l2arc read. The SCL_L2ARC lock will be * released by l2arc_read_done(). * Issue a null zio if the underlying buffer * was squashed to zero size by compression. */ ASSERT3U(arc_hdr_get_compress(hdr), !=, ZIO_COMPRESS_EMPTY); rzio = zio_read_phys(pio, vd, addr, asize, abd, ZIO_CHECKSUM_OFF, l2arc_read_done, cb, priority, zio_flags | ZIO_FLAG_DONT_CACHE | ZIO_FLAG_CANFAIL | ZIO_FLAG_DONT_PROPAGATE | ZIO_FLAG_DONT_RETRY, B_FALSE); acb->acb_zio_head = rzio; if (hash_lock != NULL) mutex_exit(hash_lock); DTRACE_PROBE2(l2arc__read, vdev_t *, vd, zio_t *, rzio); ARCSTAT_INCR(arcstat_l2_read_bytes, HDR_GET_PSIZE(hdr)); if (*arc_flags & ARC_FLAG_NOWAIT) { zio_nowait(rzio); goto out; } ASSERT(*arc_flags & ARC_FLAG_WAIT); if (zio_wait(rzio) == 0) goto out; /* l2arc read error; goto zio_read() */ if (hash_lock != NULL) mutex_enter(hash_lock); } else { DTRACE_PROBE1(l2arc__miss, arc_buf_hdr_t *, hdr); ARCSTAT_BUMP(arcstat_l2_misses); if (HDR_L2_WRITING(hdr)) ARCSTAT_BUMP(arcstat_l2_rw_clash); spa_config_exit(spa, SCL_L2ARC, vd); } } else { if (vd != NULL) spa_config_exit(spa, SCL_L2ARC, vd); /* * Only a spa with l2 should contribute to l2 * miss stats. (Including the case of having a * faulted cache device - that's also a miss.) */ if (spa_has_l2) { /* * Skip ARC stat bump for block pointers with * embedded data. The data are read from the * blkptr itself via * decode_embedded_bp_compressed(). */ if (!embedded_bp) { DTRACE_PROBE1(l2arc__miss, arc_buf_hdr_t *, hdr); ARCSTAT_BUMP(arcstat_l2_misses); } } } rzio = zio_read(pio, spa, bp, hdr_abd, size, arc_read_done, hdr, priority, zio_flags, zb); acb->acb_zio_head = rzio; if (hash_lock != NULL) mutex_exit(hash_lock); if (*arc_flags & ARC_FLAG_WAIT) { rc = zio_wait(rzio); goto out; } ASSERT(*arc_flags & ARC_FLAG_NOWAIT); zio_nowait(rzio); } out: /* embedded bps don't actually go to disk */ if (!embedded_bp) spa_read_history_add(spa, zb, *arc_flags); spl_fstrans_unmark(cookie); return (rc); } arc_prune_t * arc_add_prune_callback(arc_prune_func_t *func, void *private) { arc_prune_t *p; p = kmem_alloc(sizeof (*p), KM_SLEEP); p->p_pfunc = func; p->p_private = private; list_link_init(&p->p_node); zfs_refcount_create(&p->p_refcnt); mutex_enter(&arc_prune_mtx); zfs_refcount_add(&p->p_refcnt, &arc_prune_list); list_insert_head(&arc_prune_list, p); mutex_exit(&arc_prune_mtx); return (p); } void arc_remove_prune_callback(arc_prune_t *p) { boolean_t wait = B_FALSE; mutex_enter(&arc_prune_mtx); list_remove(&arc_prune_list, p); if (zfs_refcount_remove(&p->p_refcnt, &arc_prune_list) > 0) wait = B_TRUE; mutex_exit(&arc_prune_mtx); /* wait for arc_prune_task to finish */ if (wait) taskq_wait_outstanding(arc_prune_taskq, 0); ASSERT0(zfs_refcount_count(&p->p_refcnt)); zfs_refcount_destroy(&p->p_refcnt); kmem_free(p, sizeof (*p)); } /* * Notify the arc that a block was freed, and thus will never be used again. */ void arc_freed(spa_t *spa, const blkptr_t *bp) { arc_buf_hdr_t *hdr; kmutex_t *hash_lock; uint64_t guid = spa_load_guid(spa); ASSERT(!BP_IS_EMBEDDED(bp)); hdr = buf_hash_find(guid, bp, &hash_lock); if (hdr == NULL) return; /* * We might be trying to free a block that is still doing I/O * (i.e. prefetch) or has a reference (i.e. a dedup-ed, * dmu_sync-ed block). If this block is being prefetched, then it * would still have the ARC_FLAG_IO_IN_PROGRESS flag set on the hdr * until the I/O completes. A block may also have a reference if it is * part of a dedup-ed, dmu_synced write. The dmu_sync() function would * have written the new block to its final resting place on disk but * without the dedup flag set. This would have left the hdr in the MRU * state and discoverable. When the txg finally syncs it detects that * the block was overridden in open context and issues an override I/O. * Since this is a dedup block, the override I/O will determine if the * block is already in the DDT. If so, then it will replace the io_bp * with the bp from the DDT and allow the I/O to finish. When the I/O * reaches the done callback, dbuf_write_override_done, it will * check to see if the io_bp and io_bp_override are identical. * If they are not, then it indicates that the bp was replaced with * the bp in the DDT and the override bp is freed. This allows * us to arrive here with a reference on a block that is being * freed. So if we have an I/O in progress, or a reference to * this hdr, then we don't destroy the hdr. */ if (!HDR_HAS_L1HDR(hdr) || (!HDR_IO_IN_PROGRESS(hdr) && zfs_refcount_is_zero(&hdr->b_l1hdr.b_refcnt))) { arc_change_state(arc_anon, hdr, hash_lock); arc_hdr_destroy(hdr); mutex_exit(hash_lock); } else { mutex_exit(hash_lock); } } /* * Release this buffer from the cache, making it an anonymous buffer. This * must be done after a read and prior to modifying the buffer contents. * If the buffer has more than one reference, we must make * a new hdr for the buffer. */ void arc_release(arc_buf_t *buf, const void *tag) { arc_buf_hdr_t *hdr = buf->b_hdr; /* * It would be nice to assert that if its DMU metadata (level > * 0 || it's the dnode file), then it must be syncing context. * But we don't know that information at this level. */ mutex_enter(&buf->b_evict_lock); ASSERT(HDR_HAS_L1HDR(hdr)); /* * We don't grab the hash lock prior to this check, because if * the buffer's header is in the arc_anon state, it won't be * linked into the hash table. */ if (hdr->b_l1hdr.b_state == arc_anon) { mutex_exit(&buf->b_evict_lock); ASSERT(!HDR_IO_IN_PROGRESS(hdr)); ASSERT(!HDR_IN_HASH_TABLE(hdr)); ASSERT(!HDR_HAS_L2HDR(hdr)); ASSERT3U(hdr->b_l1hdr.b_bufcnt, ==, 1); ASSERT3S(zfs_refcount_count(&hdr->b_l1hdr.b_refcnt), ==, 1); ASSERT(!list_link_active(&hdr->b_l1hdr.b_arc_node)); hdr->b_l1hdr.b_arc_access = 0; /* * If the buf is being overridden then it may already * have a hdr that is not empty. */ buf_discard_identity(hdr); arc_buf_thaw(buf); return; } kmutex_t *hash_lock = HDR_LOCK(hdr); mutex_enter(hash_lock); /* * This assignment is only valid as long as the hash_lock is * held, we must be careful not to reference state or the * b_state field after dropping the lock. */ arc_state_t *state = hdr->b_l1hdr.b_state; ASSERT3P(hash_lock, ==, HDR_LOCK(hdr)); ASSERT3P(state, !=, arc_anon); /* this buffer is not on any list */ ASSERT3S(zfs_refcount_count(&hdr->b_l1hdr.b_refcnt), >, 0); if (HDR_HAS_L2HDR(hdr)) { mutex_enter(&hdr->b_l2hdr.b_dev->l2ad_mtx); /* * We have to recheck this conditional again now that * we're holding the l2ad_mtx to prevent a race with * another thread which might be concurrently calling * l2arc_evict(). In that case, l2arc_evict() might have * destroyed the header's L2 portion as we were waiting * to acquire the l2ad_mtx. */ if (HDR_HAS_L2HDR(hdr)) arc_hdr_l2hdr_destroy(hdr); mutex_exit(&hdr->b_l2hdr.b_dev->l2ad_mtx); } /* * Do we have more than one buf? */ if (hdr->b_l1hdr.b_bufcnt > 1) { arc_buf_hdr_t *nhdr; uint64_t spa = hdr->b_spa; uint64_t psize = HDR_GET_PSIZE(hdr); uint64_t lsize = HDR_GET_LSIZE(hdr); boolean_t protected = HDR_PROTECTED(hdr); enum zio_compress compress = arc_hdr_get_compress(hdr); arc_buf_contents_t type = arc_buf_type(hdr); VERIFY3U(hdr->b_type, ==, type); ASSERT(hdr->b_l1hdr.b_buf != buf || buf->b_next != NULL); (void) remove_reference(hdr, hash_lock, tag); if (arc_buf_is_shared(buf) && !ARC_BUF_COMPRESSED(buf)) { ASSERT3P(hdr->b_l1hdr.b_buf, !=, buf); ASSERT(ARC_BUF_LAST(buf)); } /* * Pull the data off of this hdr and attach it to * a new anonymous hdr. Also find the last buffer * in the hdr's buffer list. */ arc_buf_t *lastbuf = arc_buf_remove(hdr, buf); ASSERT3P(lastbuf, !=, NULL); /* * If the current arc_buf_t and the hdr are sharing their data * buffer, then we must stop sharing that block. */ if (arc_buf_is_shared(buf)) { ASSERT3P(hdr->b_l1hdr.b_buf, !=, buf); VERIFY(!arc_buf_is_shared(lastbuf)); /* * First, sever the block sharing relationship between * buf and the arc_buf_hdr_t. */ arc_unshare_buf(hdr, buf); /* * Now we need to recreate the hdr's b_pabd. Since we * have lastbuf handy, we try to share with it, but if * we can't then we allocate a new b_pabd and copy the * data from buf into it. */ if (arc_can_share(hdr, lastbuf)) { arc_share_buf(hdr, lastbuf); } else { arc_hdr_alloc_abd(hdr, ARC_HDR_DO_ADAPT); abd_copy_from_buf(hdr->b_l1hdr.b_pabd, buf->b_data, psize); } VERIFY3P(lastbuf->b_data, !=, NULL); } else if (HDR_SHARED_DATA(hdr)) { /* * Uncompressed shared buffers are always at the end * of the list. Compressed buffers don't have the * same requirements. This makes it hard to * simply assert that the lastbuf is shared so * we rely on the hdr's compression flags to determine * if we have a compressed, shared buffer. */ ASSERT(arc_buf_is_shared(lastbuf) || arc_hdr_get_compress(hdr) != ZIO_COMPRESS_OFF); ASSERT(!ARC_BUF_SHARED(buf)); } ASSERT(hdr->b_l1hdr.b_pabd != NULL || HDR_HAS_RABD(hdr)); ASSERT3P(state, !=, arc_l2c_only); (void) zfs_refcount_remove_many(&state->arcs_size, arc_buf_size(buf), buf); if (zfs_refcount_is_zero(&hdr->b_l1hdr.b_refcnt)) { ASSERT3P(state, !=, arc_l2c_only); (void) zfs_refcount_remove_many( &state->arcs_esize[type], arc_buf_size(buf), buf); } hdr->b_l1hdr.b_bufcnt -= 1; if (ARC_BUF_ENCRYPTED(buf)) hdr->b_crypt_hdr.b_ebufcnt -= 1; arc_cksum_verify(buf); arc_buf_unwatch(buf); /* if this is the last uncompressed buf free the checksum */ if (!arc_hdr_has_uncompressed_buf(hdr)) arc_cksum_free(hdr); mutex_exit(hash_lock); /* * Allocate a new hdr. The new hdr will contain a b_pabd * buffer which will be freed in arc_write(). */ nhdr = arc_hdr_alloc(spa, psize, lsize, protected, compress, hdr->b_complevel, type); ASSERT3P(nhdr->b_l1hdr.b_buf, ==, NULL); ASSERT0(nhdr->b_l1hdr.b_bufcnt); ASSERT0(zfs_refcount_count(&nhdr->b_l1hdr.b_refcnt)); VERIFY3U(nhdr->b_type, ==, type); ASSERT(!HDR_SHARED_DATA(nhdr)); nhdr->b_l1hdr.b_buf = buf; nhdr->b_l1hdr.b_bufcnt = 1; if (ARC_BUF_ENCRYPTED(buf)) nhdr->b_crypt_hdr.b_ebufcnt = 1; (void) zfs_refcount_add(&nhdr->b_l1hdr.b_refcnt, tag); buf->b_hdr = nhdr; mutex_exit(&buf->b_evict_lock); (void) zfs_refcount_add_many(&arc_anon->arcs_size, arc_buf_size(buf), buf); } else { mutex_exit(&buf->b_evict_lock); ASSERT(zfs_refcount_count(&hdr->b_l1hdr.b_refcnt) == 1); /* protected by hash lock, or hdr is on arc_anon */ ASSERT(!multilist_link_active(&hdr->b_l1hdr.b_arc_node)); ASSERT(!HDR_IO_IN_PROGRESS(hdr)); hdr->b_l1hdr.b_mru_hits = 0; hdr->b_l1hdr.b_mru_ghost_hits = 0; hdr->b_l1hdr.b_mfu_hits = 0; hdr->b_l1hdr.b_mfu_ghost_hits = 0; arc_change_state(arc_anon, hdr, hash_lock); hdr->b_l1hdr.b_arc_access = 0; mutex_exit(hash_lock); buf_discard_identity(hdr); arc_buf_thaw(buf); } } int arc_released(arc_buf_t *buf) { int released; mutex_enter(&buf->b_evict_lock); released = (buf->b_data != NULL && buf->b_hdr->b_l1hdr.b_state == arc_anon); mutex_exit(&buf->b_evict_lock); return (released); } #ifdef ZFS_DEBUG int arc_referenced(arc_buf_t *buf) { int referenced; mutex_enter(&buf->b_evict_lock); referenced = (zfs_refcount_count(&buf->b_hdr->b_l1hdr.b_refcnt)); mutex_exit(&buf->b_evict_lock); return (referenced); } #endif static void arc_write_ready(zio_t *zio) { arc_write_callback_t *callback = zio->io_private; arc_buf_t *buf = callback->awcb_buf; arc_buf_hdr_t *hdr = buf->b_hdr; blkptr_t *bp = zio->io_bp; uint64_t psize = BP_IS_HOLE(bp) ? 0 : BP_GET_PSIZE(bp); fstrans_cookie_t cookie = spl_fstrans_mark(); ASSERT(HDR_HAS_L1HDR(hdr)); ASSERT(!zfs_refcount_is_zero(&buf->b_hdr->b_l1hdr.b_refcnt)); ASSERT(hdr->b_l1hdr.b_bufcnt > 0); /* * If we're reexecuting this zio because the pool suspended, then * cleanup any state that was previously set the first time the * callback was invoked. */ if (zio->io_flags & ZIO_FLAG_REEXECUTED) { arc_cksum_free(hdr); arc_buf_unwatch(buf); if (hdr->b_l1hdr.b_pabd != NULL) { if (arc_buf_is_shared(buf)) { arc_unshare_buf(hdr, buf); } else { arc_hdr_free_abd(hdr, B_FALSE); } } if (HDR_HAS_RABD(hdr)) arc_hdr_free_abd(hdr, B_TRUE); } ASSERT3P(hdr->b_l1hdr.b_pabd, ==, NULL); ASSERT(!HDR_HAS_RABD(hdr)); ASSERT(!HDR_SHARED_DATA(hdr)); ASSERT(!arc_buf_is_shared(buf)); callback->awcb_ready(zio, buf, callback->awcb_private); if (HDR_IO_IN_PROGRESS(hdr)) ASSERT(zio->io_flags & ZIO_FLAG_REEXECUTED); arc_hdr_set_flags(hdr, ARC_FLAG_IO_IN_PROGRESS); if (BP_IS_PROTECTED(bp) != !!HDR_PROTECTED(hdr)) hdr = arc_hdr_realloc_crypt(hdr, BP_IS_PROTECTED(bp)); if (BP_IS_PROTECTED(bp)) { /* ZIL blocks are written through zio_rewrite */ ASSERT3U(BP_GET_TYPE(bp), !=, DMU_OT_INTENT_LOG); ASSERT(HDR_PROTECTED(hdr)); if (BP_SHOULD_BYTESWAP(bp)) { if (BP_GET_LEVEL(bp) > 0) { hdr->b_l1hdr.b_byteswap = DMU_BSWAP_UINT64; } else { hdr->b_l1hdr.b_byteswap = DMU_OT_BYTESWAP(BP_GET_TYPE(bp)); } } else { hdr->b_l1hdr.b_byteswap = DMU_BSWAP_NUMFUNCS; } hdr->b_crypt_hdr.b_ot = BP_GET_TYPE(bp); hdr->b_crypt_hdr.b_dsobj = zio->io_bookmark.zb_objset; zio_crypt_decode_params_bp(bp, hdr->b_crypt_hdr.b_salt, hdr->b_crypt_hdr.b_iv); zio_crypt_decode_mac_bp(bp, hdr->b_crypt_hdr.b_mac); } /* * If this block was written for raw encryption but the zio layer * ended up only authenticating it, adjust the buffer flags now. */ if (BP_IS_AUTHENTICATED(bp) && ARC_BUF_ENCRYPTED(buf)) { arc_hdr_set_flags(hdr, ARC_FLAG_NOAUTH); buf->b_flags &= ~ARC_BUF_FLAG_ENCRYPTED; if (BP_GET_COMPRESS(bp) == ZIO_COMPRESS_OFF) buf->b_flags &= ~ARC_BUF_FLAG_COMPRESSED; } else if (BP_IS_HOLE(bp) && ARC_BUF_ENCRYPTED(buf)) { buf->b_flags &= ~ARC_BUF_FLAG_ENCRYPTED; buf->b_flags &= ~ARC_BUF_FLAG_COMPRESSED; } /* this must be done after the buffer flags are adjusted */ arc_cksum_compute(buf); enum zio_compress compress; if (BP_IS_HOLE(bp) || BP_IS_EMBEDDED(bp)) { compress = ZIO_COMPRESS_OFF; } else { ASSERT3U(HDR_GET_LSIZE(hdr), ==, BP_GET_LSIZE(bp)); compress = BP_GET_COMPRESS(bp); } HDR_SET_PSIZE(hdr, psize); arc_hdr_set_compress(hdr, compress); hdr->b_complevel = zio->io_prop.zp_complevel; if (zio->io_error != 0 || psize == 0) goto out; /* * Fill the hdr with data. If the buffer is encrypted we have no choice * but to copy the data into b_radb. If the hdr is compressed, the data * we want is available from the zio, otherwise we can take it from * the buf. * * We might be able to share the buf's data with the hdr here. However, * doing so would cause the ARC to be full of linear ABDs if we write a * lot of shareable data. As a compromise, we check whether scattered * ABDs are allowed, and assume that if they are then the user wants * the ARC to be primarily filled with them regardless of the data being * written. Therefore, if they're allowed then we allocate one and copy * the data into it; otherwise, we share the data directly if we can. */ if (ARC_BUF_ENCRYPTED(buf)) { ASSERT3U(psize, >, 0); ASSERT(ARC_BUF_COMPRESSED(buf)); arc_hdr_alloc_abd(hdr, ARC_HDR_DO_ADAPT | ARC_HDR_ALLOC_RDATA | ARC_HDR_USE_RESERVE); abd_copy(hdr->b_crypt_hdr.b_rabd, zio->io_abd, psize); } else if (!abd_size_alloc_linear(arc_buf_size(buf)) || !arc_can_share(hdr, buf)) { /* * Ideally, we would always copy the io_abd into b_pabd, but the * user may have disabled compressed ARC, thus we must check the * hdr's compression setting rather than the io_bp's. */ if (BP_IS_ENCRYPTED(bp)) { ASSERT3U(psize, >, 0); arc_hdr_alloc_abd(hdr, ARC_HDR_DO_ADAPT | ARC_HDR_ALLOC_RDATA | ARC_HDR_USE_RESERVE); abd_copy(hdr->b_crypt_hdr.b_rabd, zio->io_abd, psize); } else if (arc_hdr_get_compress(hdr) != ZIO_COMPRESS_OFF && !ARC_BUF_COMPRESSED(buf)) { ASSERT3U(psize, >, 0); arc_hdr_alloc_abd(hdr, ARC_HDR_DO_ADAPT | ARC_HDR_USE_RESERVE); abd_copy(hdr->b_l1hdr.b_pabd, zio->io_abd, psize); } else { ASSERT3U(zio->io_orig_size, ==, arc_hdr_size(hdr)); arc_hdr_alloc_abd(hdr, ARC_HDR_DO_ADAPT | ARC_HDR_USE_RESERVE); abd_copy_from_buf(hdr->b_l1hdr.b_pabd, buf->b_data, arc_buf_size(buf)); } } else { ASSERT3P(buf->b_data, ==, abd_to_buf(zio->io_orig_abd)); ASSERT3U(zio->io_orig_size, ==, arc_buf_size(buf)); ASSERT3U(hdr->b_l1hdr.b_bufcnt, ==, 1); arc_share_buf(hdr, buf); } out: arc_hdr_verify(hdr, bp); spl_fstrans_unmark(cookie); } static void arc_write_children_ready(zio_t *zio) { arc_write_callback_t *callback = zio->io_private; arc_buf_t *buf = callback->awcb_buf; callback->awcb_children_ready(zio, buf, callback->awcb_private); } /* * The SPA calls this callback for each physical write that happens on behalf * of a logical write. See the comment in dbuf_write_physdone() for details. */ static void arc_write_physdone(zio_t *zio) { arc_write_callback_t *cb = zio->io_private; if (cb->awcb_physdone != NULL) cb->awcb_physdone(zio, cb->awcb_buf, cb->awcb_private); } static void arc_write_done(zio_t *zio) { arc_write_callback_t *callback = zio->io_private; arc_buf_t *buf = callback->awcb_buf; arc_buf_hdr_t *hdr = buf->b_hdr; ASSERT3P(hdr->b_l1hdr.b_acb, ==, NULL); if (zio->io_error == 0) { arc_hdr_verify(hdr, zio->io_bp); if (BP_IS_HOLE(zio->io_bp) || BP_IS_EMBEDDED(zio->io_bp)) { buf_discard_identity(hdr); } else { hdr->b_dva = *BP_IDENTITY(zio->io_bp); hdr->b_birth = BP_PHYSICAL_BIRTH(zio->io_bp); } } else { ASSERT(HDR_EMPTY(hdr)); } /* * If the block to be written was all-zero or compressed enough to be * embedded in the BP, no write was performed so there will be no * dva/birth/checksum. The buffer must therefore remain anonymous * (and uncached). */ if (!HDR_EMPTY(hdr)) { arc_buf_hdr_t *exists; kmutex_t *hash_lock; ASSERT3U(zio->io_error, ==, 0); arc_cksum_verify(buf); exists = buf_hash_insert(hdr, &hash_lock); if (exists != NULL) { /* * This can only happen if we overwrite for * sync-to-convergence, because we remove * buffers from the hash table when we arc_free(). */ if (zio->io_flags & ZIO_FLAG_IO_REWRITE) { if (!BP_EQUAL(&zio->io_bp_orig, zio->io_bp)) panic("bad overwrite, hdr=%p exists=%p", (void *)hdr, (void *)exists); ASSERT(zfs_refcount_is_zero( &exists->b_l1hdr.b_refcnt)); arc_change_state(arc_anon, exists, hash_lock); arc_hdr_destroy(exists); mutex_exit(hash_lock); exists = buf_hash_insert(hdr, &hash_lock); ASSERT3P(exists, ==, NULL); } else if (zio->io_flags & ZIO_FLAG_NOPWRITE) { /* nopwrite */ ASSERT(zio->io_prop.zp_nopwrite); if (!BP_EQUAL(&zio->io_bp_orig, zio->io_bp)) panic("bad nopwrite, hdr=%p exists=%p", (void *)hdr, (void *)exists); } else { /* Dedup */ ASSERT(hdr->b_l1hdr.b_bufcnt == 1); ASSERT(hdr->b_l1hdr.b_state == arc_anon); ASSERT(BP_GET_DEDUP(zio->io_bp)); ASSERT(BP_GET_LEVEL(zio->io_bp) == 0); } } arc_hdr_clear_flags(hdr, ARC_FLAG_IO_IN_PROGRESS); /* if it's not anon, we are doing a scrub */ if (exists == NULL && hdr->b_l1hdr.b_state == arc_anon) arc_access(hdr, hash_lock); mutex_exit(hash_lock); } else { arc_hdr_clear_flags(hdr, ARC_FLAG_IO_IN_PROGRESS); } ASSERT(!zfs_refcount_is_zero(&hdr->b_l1hdr.b_refcnt)); callback->awcb_done(zio, buf, callback->awcb_private); abd_free(zio->io_abd); kmem_free(callback, sizeof (arc_write_callback_t)); } zio_t * arc_write(zio_t *pio, spa_t *spa, uint64_t txg, blkptr_t *bp, arc_buf_t *buf, boolean_t l2arc, const zio_prop_t *zp, arc_write_done_func_t *ready, arc_write_done_func_t *children_ready, arc_write_done_func_t *physdone, arc_write_done_func_t *done, void *private, zio_priority_t priority, int zio_flags, const zbookmark_phys_t *zb) { arc_buf_hdr_t *hdr = buf->b_hdr; arc_write_callback_t *callback; zio_t *zio; zio_prop_t localprop = *zp; ASSERT3P(ready, !=, NULL); ASSERT3P(done, !=, NULL); ASSERT(!HDR_IO_ERROR(hdr)); ASSERT(!HDR_IO_IN_PROGRESS(hdr)); ASSERT3P(hdr->b_l1hdr.b_acb, ==, NULL); ASSERT3U(hdr->b_l1hdr.b_bufcnt, >, 0); if (l2arc) arc_hdr_set_flags(hdr, ARC_FLAG_L2CACHE); if (ARC_BUF_ENCRYPTED(buf)) { ASSERT(ARC_BUF_COMPRESSED(buf)); localprop.zp_encrypt = B_TRUE; localprop.zp_compress = HDR_GET_COMPRESS(hdr); localprop.zp_complevel = hdr->b_complevel; localprop.zp_byteorder = (hdr->b_l1hdr.b_byteswap == DMU_BSWAP_NUMFUNCS) ? ZFS_HOST_BYTEORDER : !ZFS_HOST_BYTEORDER; memcpy(localprop.zp_salt, hdr->b_crypt_hdr.b_salt, ZIO_DATA_SALT_LEN); memcpy(localprop.zp_iv, hdr->b_crypt_hdr.b_iv, ZIO_DATA_IV_LEN); memcpy(localprop.zp_mac, hdr->b_crypt_hdr.b_mac, ZIO_DATA_MAC_LEN); if (DMU_OT_IS_ENCRYPTED(localprop.zp_type)) { localprop.zp_nopwrite = B_FALSE; localprop.zp_copies = MIN(localprop.zp_copies, SPA_DVAS_PER_BP - 1); } zio_flags |= ZIO_FLAG_RAW; } else if (ARC_BUF_COMPRESSED(buf)) { ASSERT3U(HDR_GET_LSIZE(hdr), !=, arc_buf_size(buf)); localprop.zp_compress = HDR_GET_COMPRESS(hdr); localprop.zp_complevel = hdr->b_complevel; zio_flags |= ZIO_FLAG_RAW_COMPRESS; } callback = kmem_zalloc(sizeof (arc_write_callback_t), KM_SLEEP); callback->awcb_ready = ready; callback->awcb_children_ready = children_ready; callback->awcb_physdone = physdone; callback->awcb_done = done; callback->awcb_private = private; callback->awcb_buf = buf; /* * The hdr's b_pabd is now stale, free it now. A new data block * will be allocated when the zio pipeline calls arc_write_ready(). */ if (hdr->b_l1hdr.b_pabd != NULL) { /* * If the buf is currently sharing the data block with * the hdr then we need to break that relationship here. * The hdr will remain with a NULL data pointer and the * buf will take sole ownership of the block. */ if (arc_buf_is_shared(buf)) { arc_unshare_buf(hdr, buf); } else { arc_hdr_free_abd(hdr, B_FALSE); } VERIFY3P(buf->b_data, !=, NULL); } if (HDR_HAS_RABD(hdr)) arc_hdr_free_abd(hdr, B_TRUE); if (!(zio_flags & ZIO_FLAG_RAW)) arc_hdr_set_compress(hdr, ZIO_COMPRESS_OFF); ASSERT(!arc_buf_is_shared(buf)); ASSERT3P(hdr->b_l1hdr.b_pabd, ==, NULL); zio = zio_write(pio, spa, txg, bp, abd_get_from_buf(buf->b_data, HDR_GET_LSIZE(hdr)), HDR_GET_LSIZE(hdr), arc_buf_size(buf), &localprop, arc_write_ready, (children_ready != NULL) ? arc_write_children_ready : NULL, arc_write_physdone, arc_write_done, callback, priority, zio_flags, zb); return (zio); } void arc_tempreserve_clear(uint64_t reserve) { atomic_add_64(&arc_tempreserve, -reserve); ASSERT((int64_t)arc_tempreserve >= 0); } int arc_tempreserve_space(spa_t *spa, uint64_t reserve, uint64_t txg) { int error; uint64_t anon_size; if (!arc_no_grow && reserve > arc_c/4 && reserve * 4 > (2ULL << SPA_MAXBLOCKSHIFT)) arc_c = MIN(arc_c_max, reserve * 4); /* * Throttle when the calculated memory footprint for the TXG * exceeds the target ARC size. */ if (reserve > arc_c) { DMU_TX_STAT_BUMP(dmu_tx_memory_reserve); return (SET_ERROR(ERESTART)); } /* * Don't count loaned bufs as in flight dirty data to prevent long * network delays from blocking transactions that are ready to be * assigned to a txg. */ /* assert that it has not wrapped around */ ASSERT3S(atomic_add_64_nv(&arc_loaned_bytes, 0), >=, 0); anon_size = MAX((int64_t)(zfs_refcount_count(&arc_anon->arcs_size) - arc_loaned_bytes), 0); /* * Writes will, almost always, require additional memory allocations * in order to compress/encrypt/etc the data. We therefore need to * make sure that there is sufficient available memory for this. */ error = arc_memory_throttle(spa, reserve, txg); if (error != 0) return (error); /* * Throttle writes when the amount of dirty data in the cache * gets too large. We try to keep the cache less than half full * of dirty blocks so that our sync times don't grow too large. * * In the case of one pool being built on another pool, we want * to make sure we don't end up throttling the lower (backing) * pool when the upper pool is the majority contributor to dirty * data. To insure we make forward progress during throttling, we * also check the current pool's net dirty data and only throttle * if it exceeds zfs_arc_pool_dirty_percent of the anonymous dirty * data in the cache. * * Note: if two requests come in concurrently, we might let them * both succeed, when one of them should fail. Not a huge deal. */ uint64_t total_dirty = reserve + arc_tempreserve + anon_size; uint64_t spa_dirty_anon = spa_dirty_data(spa); uint64_t rarc_c = arc_warm ? arc_c : arc_c_max; if (total_dirty > rarc_c * zfs_arc_dirty_limit_percent / 100 && anon_size > rarc_c * zfs_arc_anon_limit_percent / 100 && spa_dirty_anon > anon_size * zfs_arc_pool_dirty_percent / 100) { #ifdef ZFS_DEBUG uint64_t meta_esize = zfs_refcount_count( &arc_anon->arcs_esize[ARC_BUFC_METADATA]); uint64_t data_esize = zfs_refcount_count(&arc_anon->arcs_esize[ARC_BUFC_DATA]); dprintf("failing, arc_tempreserve=%lluK anon_meta=%lluK " "anon_data=%lluK tempreserve=%lluK rarc_c=%lluK\n", (u_longlong_t)arc_tempreserve >> 10, (u_longlong_t)meta_esize >> 10, (u_longlong_t)data_esize >> 10, (u_longlong_t)reserve >> 10, (u_longlong_t)rarc_c >> 10); #endif DMU_TX_STAT_BUMP(dmu_tx_dirty_throttle); return (SET_ERROR(ERESTART)); } atomic_add_64(&arc_tempreserve, reserve); return (0); } static void arc_kstat_update_state(arc_state_t *state, kstat_named_t *size, kstat_named_t *evict_data, kstat_named_t *evict_metadata) { size->value.ui64 = zfs_refcount_count(&state->arcs_size); evict_data->value.ui64 = zfs_refcount_count(&state->arcs_esize[ARC_BUFC_DATA]); evict_metadata->value.ui64 = zfs_refcount_count(&state->arcs_esize[ARC_BUFC_METADATA]); } static int arc_kstat_update(kstat_t *ksp, int rw) { arc_stats_t *as = ksp->ks_data; if (rw == KSTAT_WRITE) return (SET_ERROR(EACCES)); as->arcstat_hits.value.ui64 = wmsum_value(&arc_sums.arcstat_hits); as->arcstat_misses.value.ui64 = wmsum_value(&arc_sums.arcstat_misses); as->arcstat_demand_data_hits.value.ui64 = wmsum_value(&arc_sums.arcstat_demand_data_hits); as->arcstat_demand_data_misses.value.ui64 = wmsum_value(&arc_sums.arcstat_demand_data_misses); as->arcstat_demand_metadata_hits.value.ui64 = wmsum_value(&arc_sums.arcstat_demand_metadata_hits); as->arcstat_demand_metadata_misses.value.ui64 = wmsum_value(&arc_sums.arcstat_demand_metadata_misses); as->arcstat_prefetch_data_hits.value.ui64 = wmsum_value(&arc_sums.arcstat_prefetch_data_hits); as->arcstat_prefetch_data_misses.value.ui64 = wmsum_value(&arc_sums.arcstat_prefetch_data_misses); as->arcstat_prefetch_metadata_hits.value.ui64 = wmsum_value(&arc_sums.arcstat_prefetch_metadata_hits); as->arcstat_prefetch_metadata_misses.value.ui64 = wmsum_value(&arc_sums.arcstat_prefetch_metadata_misses); as->arcstat_mru_hits.value.ui64 = wmsum_value(&arc_sums.arcstat_mru_hits); as->arcstat_mru_ghost_hits.value.ui64 = wmsum_value(&arc_sums.arcstat_mru_ghost_hits); as->arcstat_mfu_hits.value.ui64 = wmsum_value(&arc_sums.arcstat_mfu_hits); as->arcstat_mfu_ghost_hits.value.ui64 = wmsum_value(&arc_sums.arcstat_mfu_ghost_hits); as->arcstat_deleted.value.ui64 = wmsum_value(&arc_sums.arcstat_deleted); as->arcstat_mutex_miss.value.ui64 = wmsum_value(&arc_sums.arcstat_mutex_miss); as->arcstat_access_skip.value.ui64 = wmsum_value(&arc_sums.arcstat_access_skip); as->arcstat_evict_skip.value.ui64 = wmsum_value(&arc_sums.arcstat_evict_skip); as->arcstat_evict_not_enough.value.ui64 = wmsum_value(&arc_sums.arcstat_evict_not_enough); as->arcstat_evict_l2_cached.value.ui64 = wmsum_value(&arc_sums.arcstat_evict_l2_cached); as->arcstat_evict_l2_eligible.value.ui64 = wmsum_value(&arc_sums.arcstat_evict_l2_eligible); as->arcstat_evict_l2_eligible_mfu.value.ui64 = wmsum_value(&arc_sums.arcstat_evict_l2_eligible_mfu); as->arcstat_evict_l2_eligible_mru.value.ui64 = wmsum_value(&arc_sums.arcstat_evict_l2_eligible_mru); as->arcstat_evict_l2_ineligible.value.ui64 = wmsum_value(&arc_sums.arcstat_evict_l2_ineligible); as->arcstat_evict_l2_skip.value.ui64 = wmsum_value(&arc_sums.arcstat_evict_l2_skip); as->arcstat_hash_collisions.value.ui64 = wmsum_value(&arc_sums.arcstat_hash_collisions); as->arcstat_hash_chains.value.ui64 = wmsum_value(&arc_sums.arcstat_hash_chains); as->arcstat_size.value.ui64 = aggsum_value(&arc_sums.arcstat_size); as->arcstat_compressed_size.value.ui64 = wmsum_value(&arc_sums.arcstat_compressed_size); as->arcstat_uncompressed_size.value.ui64 = wmsum_value(&arc_sums.arcstat_uncompressed_size); as->arcstat_overhead_size.value.ui64 = wmsum_value(&arc_sums.arcstat_overhead_size); as->arcstat_hdr_size.value.ui64 = wmsum_value(&arc_sums.arcstat_hdr_size); as->arcstat_data_size.value.ui64 = wmsum_value(&arc_sums.arcstat_data_size); as->arcstat_metadata_size.value.ui64 = wmsum_value(&arc_sums.arcstat_metadata_size); as->arcstat_dbuf_size.value.ui64 = wmsum_value(&arc_sums.arcstat_dbuf_size); #if defined(COMPAT_FREEBSD11) as->arcstat_other_size.value.ui64 = wmsum_value(&arc_sums.arcstat_bonus_size) + aggsum_value(&arc_sums.arcstat_dnode_size) + wmsum_value(&arc_sums.arcstat_dbuf_size); #endif arc_kstat_update_state(arc_anon, &as->arcstat_anon_size, &as->arcstat_anon_evictable_data, &as->arcstat_anon_evictable_metadata); arc_kstat_update_state(arc_mru, &as->arcstat_mru_size, &as->arcstat_mru_evictable_data, &as->arcstat_mru_evictable_metadata); arc_kstat_update_state(arc_mru_ghost, &as->arcstat_mru_ghost_size, &as->arcstat_mru_ghost_evictable_data, &as->arcstat_mru_ghost_evictable_metadata); arc_kstat_update_state(arc_mfu, &as->arcstat_mfu_size, &as->arcstat_mfu_evictable_data, &as->arcstat_mfu_evictable_metadata); arc_kstat_update_state(arc_mfu_ghost, &as->arcstat_mfu_ghost_size, &as->arcstat_mfu_ghost_evictable_data, &as->arcstat_mfu_ghost_evictable_metadata); as->arcstat_dnode_size.value.ui64 = aggsum_value(&arc_sums.arcstat_dnode_size); as->arcstat_bonus_size.value.ui64 = wmsum_value(&arc_sums.arcstat_bonus_size); as->arcstat_l2_hits.value.ui64 = wmsum_value(&arc_sums.arcstat_l2_hits); as->arcstat_l2_misses.value.ui64 = wmsum_value(&arc_sums.arcstat_l2_misses); as->arcstat_l2_prefetch_asize.value.ui64 = wmsum_value(&arc_sums.arcstat_l2_prefetch_asize); as->arcstat_l2_mru_asize.value.ui64 = wmsum_value(&arc_sums.arcstat_l2_mru_asize); as->arcstat_l2_mfu_asize.value.ui64 = wmsum_value(&arc_sums.arcstat_l2_mfu_asize); as->arcstat_l2_bufc_data_asize.value.ui64 = wmsum_value(&arc_sums.arcstat_l2_bufc_data_asize); as->arcstat_l2_bufc_metadata_asize.value.ui64 = wmsum_value(&arc_sums.arcstat_l2_bufc_metadata_asize); as->arcstat_l2_feeds.value.ui64 = wmsum_value(&arc_sums.arcstat_l2_feeds); as->arcstat_l2_rw_clash.value.ui64 = wmsum_value(&arc_sums.arcstat_l2_rw_clash); as->arcstat_l2_read_bytes.value.ui64 = wmsum_value(&arc_sums.arcstat_l2_read_bytes); as->arcstat_l2_write_bytes.value.ui64 = wmsum_value(&arc_sums.arcstat_l2_write_bytes); as->arcstat_l2_writes_sent.value.ui64 = wmsum_value(&arc_sums.arcstat_l2_writes_sent); as->arcstat_l2_writes_done.value.ui64 = wmsum_value(&arc_sums.arcstat_l2_writes_done); as->arcstat_l2_writes_error.value.ui64 = wmsum_value(&arc_sums.arcstat_l2_writes_error); as->arcstat_l2_writes_lock_retry.value.ui64 = wmsum_value(&arc_sums.arcstat_l2_writes_lock_retry); as->arcstat_l2_evict_lock_retry.value.ui64 = wmsum_value(&arc_sums.arcstat_l2_evict_lock_retry); as->arcstat_l2_evict_reading.value.ui64 = wmsum_value(&arc_sums.arcstat_l2_evict_reading); as->arcstat_l2_evict_l1cached.value.ui64 = wmsum_value(&arc_sums.arcstat_l2_evict_l1cached); as->arcstat_l2_free_on_write.value.ui64 = wmsum_value(&arc_sums.arcstat_l2_free_on_write); as->arcstat_l2_abort_lowmem.value.ui64 = wmsum_value(&arc_sums.arcstat_l2_abort_lowmem); as->arcstat_l2_cksum_bad.value.ui64 = wmsum_value(&arc_sums.arcstat_l2_cksum_bad); as->arcstat_l2_io_error.value.ui64 = wmsum_value(&arc_sums.arcstat_l2_io_error); as->arcstat_l2_lsize.value.ui64 = wmsum_value(&arc_sums.arcstat_l2_lsize); as->arcstat_l2_psize.value.ui64 = wmsum_value(&arc_sums.arcstat_l2_psize); as->arcstat_l2_hdr_size.value.ui64 = aggsum_value(&arc_sums.arcstat_l2_hdr_size); as->arcstat_l2_log_blk_writes.value.ui64 = wmsum_value(&arc_sums.arcstat_l2_log_blk_writes); as->arcstat_l2_log_blk_asize.value.ui64 = wmsum_value(&arc_sums.arcstat_l2_log_blk_asize); as->arcstat_l2_log_blk_count.value.ui64 = wmsum_value(&arc_sums.arcstat_l2_log_blk_count); as->arcstat_l2_rebuild_success.value.ui64 = wmsum_value(&arc_sums.arcstat_l2_rebuild_success); as->arcstat_l2_rebuild_abort_unsupported.value.ui64 = wmsum_value(&arc_sums.arcstat_l2_rebuild_abort_unsupported); as->arcstat_l2_rebuild_abort_io_errors.value.ui64 = wmsum_value(&arc_sums.arcstat_l2_rebuild_abort_io_errors); as->arcstat_l2_rebuild_abort_dh_errors.value.ui64 = wmsum_value(&arc_sums.arcstat_l2_rebuild_abort_dh_errors); as->arcstat_l2_rebuild_abort_cksum_lb_errors.value.ui64 = wmsum_value(&arc_sums.arcstat_l2_rebuild_abort_cksum_lb_errors); as->arcstat_l2_rebuild_abort_lowmem.value.ui64 = wmsum_value(&arc_sums.arcstat_l2_rebuild_abort_lowmem); as->arcstat_l2_rebuild_size.value.ui64 = wmsum_value(&arc_sums.arcstat_l2_rebuild_size); as->arcstat_l2_rebuild_asize.value.ui64 = wmsum_value(&arc_sums.arcstat_l2_rebuild_asize); as->arcstat_l2_rebuild_bufs.value.ui64 = wmsum_value(&arc_sums.arcstat_l2_rebuild_bufs); as->arcstat_l2_rebuild_bufs_precached.value.ui64 = wmsum_value(&arc_sums.arcstat_l2_rebuild_bufs_precached); as->arcstat_l2_rebuild_log_blks.value.ui64 = wmsum_value(&arc_sums.arcstat_l2_rebuild_log_blks); as->arcstat_memory_throttle_count.value.ui64 = wmsum_value(&arc_sums.arcstat_memory_throttle_count); as->arcstat_memory_direct_count.value.ui64 = wmsum_value(&arc_sums.arcstat_memory_direct_count); as->arcstat_memory_indirect_count.value.ui64 = wmsum_value(&arc_sums.arcstat_memory_indirect_count); as->arcstat_memory_all_bytes.value.ui64 = arc_all_memory(); as->arcstat_memory_free_bytes.value.ui64 = arc_free_memory(); as->arcstat_memory_available_bytes.value.i64 = arc_available_memory(); as->arcstat_prune.value.ui64 = wmsum_value(&arc_sums.arcstat_prune); as->arcstat_meta_used.value.ui64 = aggsum_value(&arc_sums.arcstat_meta_used); as->arcstat_async_upgrade_sync.value.ui64 = wmsum_value(&arc_sums.arcstat_async_upgrade_sync); as->arcstat_demand_hit_predictive_prefetch.value.ui64 = wmsum_value(&arc_sums.arcstat_demand_hit_predictive_prefetch); as->arcstat_demand_hit_prescient_prefetch.value.ui64 = wmsum_value(&arc_sums.arcstat_demand_hit_prescient_prefetch); as->arcstat_raw_size.value.ui64 = wmsum_value(&arc_sums.arcstat_raw_size); as->arcstat_cached_only_in_progress.value.ui64 = wmsum_value(&arc_sums.arcstat_cached_only_in_progress); as->arcstat_abd_chunk_waste_size.value.ui64 = wmsum_value(&arc_sums.arcstat_abd_chunk_waste_size); return (0); } /* * This function *must* return indices evenly distributed between all * sublists of the multilist. This is needed due to how the ARC eviction * code is laid out; arc_evict_state() assumes ARC buffers are evenly * distributed between all sublists and uses this assumption when * deciding which sublist to evict from and how much to evict from it. */ static unsigned int arc_state_multilist_index_func(multilist_t *ml, void *obj) { arc_buf_hdr_t *hdr = obj; /* * We rely on b_dva to generate evenly distributed index * numbers using buf_hash below. So, as an added precaution, * let's make sure we never add empty buffers to the arc lists. */ ASSERT(!HDR_EMPTY(hdr)); /* * The assumption here, is the hash value for a given * arc_buf_hdr_t will remain constant throughout its lifetime * (i.e. its b_spa, b_dva, and b_birth fields don't change). * Thus, we don't need to store the header's sublist index * on insertion, as this index can be recalculated on removal. * * Also, the low order bits of the hash value are thought to be * distributed evenly. Otherwise, in the case that the multilist * has a power of two number of sublists, each sublists' usage * would not be evenly distributed. In this context full 64bit * division would be a waste of time, so limit it to 32 bits. */ return ((unsigned int)buf_hash(hdr->b_spa, &hdr->b_dva, hdr->b_birth) % multilist_get_num_sublists(ml)); } static unsigned int arc_state_l2c_multilist_index_func(multilist_t *ml, void *obj) { panic("Header %p insert into arc_l2c_only %p", obj, ml); } #define WARN_IF_TUNING_IGNORED(tuning, value, do_warn) do { \ if ((do_warn) && (tuning) && ((tuning) != (value))) { \ cmn_err(CE_WARN, \ "ignoring tunable %s (using %llu instead)", \ (#tuning), (u_longlong_t)(value)); \ } \ } while (0) /* * Called during module initialization and periodically thereafter to * apply reasonable changes to the exposed performance tunings. Can also be * called explicitly by param_set_arc_*() functions when ARC tunables are * updated manually. Non-zero zfs_* values which differ from the currently set * values will be applied. */ void arc_tuning_update(boolean_t verbose) { uint64_t allmem = arc_all_memory(); unsigned long limit; /* Valid range: 32M - */ if ((zfs_arc_min) && (zfs_arc_min != arc_c_min) && (zfs_arc_min >= 2ULL << SPA_MAXBLOCKSHIFT) && (zfs_arc_min <= arc_c_max)) { arc_c_min = zfs_arc_min; arc_c = MAX(arc_c, arc_c_min); } WARN_IF_TUNING_IGNORED(zfs_arc_min, arc_c_min, verbose); /* Valid range: 64M - */ if ((zfs_arc_max) && (zfs_arc_max != arc_c_max) && (zfs_arc_max >= MIN_ARC_MAX) && (zfs_arc_max < allmem) && (zfs_arc_max > arc_c_min)) { arc_c_max = zfs_arc_max; arc_c = MIN(arc_c, arc_c_max); arc_p = (arc_c >> 1); if (arc_meta_limit > arc_c_max) arc_meta_limit = arc_c_max; if (arc_dnode_size_limit > arc_meta_limit) arc_dnode_size_limit = arc_meta_limit; } WARN_IF_TUNING_IGNORED(zfs_arc_max, arc_c_max, verbose); /* Valid range: 16M - */ if ((zfs_arc_meta_min) && (zfs_arc_meta_min != arc_meta_min) && (zfs_arc_meta_min >= 1ULL << SPA_MAXBLOCKSHIFT) && (zfs_arc_meta_min <= arc_c_max)) { arc_meta_min = zfs_arc_meta_min; if (arc_meta_limit < arc_meta_min) arc_meta_limit = arc_meta_min; if (arc_dnode_size_limit < arc_meta_min) arc_dnode_size_limit = arc_meta_min; } WARN_IF_TUNING_IGNORED(zfs_arc_meta_min, arc_meta_min, verbose); /* Valid range: - */ limit = zfs_arc_meta_limit ? zfs_arc_meta_limit : MIN(zfs_arc_meta_limit_percent, 100) * arc_c_max / 100; if ((limit != arc_meta_limit) && (limit >= arc_meta_min) && (limit <= arc_c_max)) arc_meta_limit = limit; WARN_IF_TUNING_IGNORED(zfs_arc_meta_limit, arc_meta_limit, verbose); /* Valid range: - */ limit = zfs_arc_dnode_limit ? zfs_arc_dnode_limit : MIN(zfs_arc_dnode_limit_percent, 100) * arc_meta_limit / 100; if ((limit != arc_dnode_size_limit) && (limit >= arc_meta_min) && (limit <= arc_meta_limit)) arc_dnode_size_limit = limit; WARN_IF_TUNING_IGNORED(zfs_arc_dnode_limit, arc_dnode_size_limit, verbose); /* Valid range: 1 - N */ if (zfs_arc_grow_retry) arc_grow_retry = zfs_arc_grow_retry; /* Valid range: 1 - N */ if (zfs_arc_shrink_shift) { arc_shrink_shift = zfs_arc_shrink_shift; arc_no_grow_shift = MIN(arc_no_grow_shift, arc_shrink_shift -1); } /* Valid range: 1 - N */ if (zfs_arc_p_min_shift) arc_p_min_shift = zfs_arc_p_min_shift; /* Valid range: 1 - N ms */ if (zfs_arc_min_prefetch_ms) arc_min_prefetch_ms = zfs_arc_min_prefetch_ms; /* Valid range: 1 - N ms */ if (zfs_arc_min_prescient_prefetch_ms) { arc_min_prescient_prefetch_ms = zfs_arc_min_prescient_prefetch_ms; } /* Valid range: 0 - 100 */ if (zfs_arc_lotsfree_percent <= 100) arc_lotsfree_percent = zfs_arc_lotsfree_percent; WARN_IF_TUNING_IGNORED(zfs_arc_lotsfree_percent, arc_lotsfree_percent, verbose); /* Valid range: 0 - */ if ((zfs_arc_sys_free) && (zfs_arc_sys_free != arc_sys_free)) arc_sys_free = MIN(zfs_arc_sys_free, allmem); WARN_IF_TUNING_IGNORED(zfs_arc_sys_free, arc_sys_free, verbose); } static void arc_state_multilist_init(multilist_t *ml, multilist_sublist_index_func_t *index_func, int *maxcountp) { multilist_create(ml, sizeof (arc_buf_hdr_t), offsetof(arc_buf_hdr_t, b_l1hdr.b_arc_node), index_func); *maxcountp = MAX(*maxcountp, multilist_get_num_sublists(ml)); } static void arc_state_init(void) { int num_sublists = 0; arc_state_multilist_init(&arc_mru->arcs_list[ARC_BUFC_METADATA], arc_state_multilist_index_func, &num_sublists); arc_state_multilist_init(&arc_mru->arcs_list[ARC_BUFC_DATA], arc_state_multilist_index_func, &num_sublists); arc_state_multilist_init(&arc_mru_ghost->arcs_list[ARC_BUFC_METADATA], arc_state_multilist_index_func, &num_sublists); arc_state_multilist_init(&arc_mru_ghost->arcs_list[ARC_BUFC_DATA], arc_state_multilist_index_func, &num_sublists); arc_state_multilist_init(&arc_mfu->arcs_list[ARC_BUFC_METADATA], arc_state_multilist_index_func, &num_sublists); arc_state_multilist_init(&arc_mfu->arcs_list[ARC_BUFC_DATA], arc_state_multilist_index_func, &num_sublists); arc_state_multilist_init(&arc_mfu_ghost->arcs_list[ARC_BUFC_METADATA], arc_state_multilist_index_func, &num_sublists); arc_state_multilist_init(&arc_mfu_ghost->arcs_list[ARC_BUFC_DATA], arc_state_multilist_index_func, &num_sublists); /* * L2 headers should never be on the L2 state list since they don't * have L1 headers allocated. Special index function asserts that. */ arc_state_multilist_init(&arc_l2c_only->arcs_list[ARC_BUFC_METADATA], arc_state_l2c_multilist_index_func, &num_sublists); arc_state_multilist_init(&arc_l2c_only->arcs_list[ARC_BUFC_DATA], arc_state_l2c_multilist_index_func, &num_sublists); /* * Keep track of the number of markers needed to reclaim buffers from * any ARC state. The markers will be pre-allocated so as to minimize * the number of memory allocations performed by the eviction thread. */ arc_state_evict_marker_count = num_sublists; zfs_refcount_create(&arc_anon->arcs_esize[ARC_BUFC_METADATA]); zfs_refcount_create(&arc_anon->arcs_esize[ARC_BUFC_DATA]); zfs_refcount_create(&arc_mru->arcs_esize[ARC_BUFC_METADATA]); zfs_refcount_create(&arc_mru->arcs_esize[ARC_BUFC_DATA]); zfs_refcount_create(&arc_mru_ghost->arcs_esize[ARC_BUFC_METADATA]); zfs_refcount_create(&arc_mru_ghost->arcs_esize[ARC_BUFC_DATA]); zfs_refcount_create(&arc_mfu->arcs_esize[ARC_BUFC_METADATA]); zfs_refcount_create(&arc_mfu->arcs_esize[ARC_BUFC_DATA]); zfs_refcount_create(&arc_mfu_ghost->arcs_esize[ARC_BUFC_METADATA]); zfs_refcount_create(&arc_mfu_ghost->arcs_esize[ARC_BUFC_DATA]); zfs_refcount_create(&arc_l2c_only->arcs_esize[ARC_BUFC_METADATA]); zfs_refcount_create(&arc_l2c_only->arcs_esize[ARC_BUFC_DATA]); zfs_refcount_create(&arc_anon->arcs_size); zfs_refcount_create(&arc_mru->arcs_size); zfs_refcount_create(&arc_mru_ghost->arcs_size); zfs_refcount_create(&arc_mfu->arcs_size); zfs_refcount_create(&arc_mfu_ghost->arcs_size); zfs_refcount_create(&arc_l2c_only->arcs_size); wmsum_init(&arc_sums.arcstat_hits, 0); wmsum_init(&arc_sums.arcstat_misses, 0); wmsum_init(&arc_sums.arcstat_demand_data_hits, 0); wmsum_init(&arc_sums.arcstat_demand_data_misses, 0); wmsum_init(&arc_sums.arcstat_demand_metadata_hits, 0); wmsum_init(&arc_sums.arcstat_demand_metadata_misses, 0); wmsum_init(&arc_sums.arcstat_prefetch_data_hits, 0); wmsum_init(&arc_sums.arcstat_prefetch_data_misses, 0); wmsum_init(&arc_sums.arcstat_prefetch_metadata_hits, 0); wmsum_init(&arc_sums.arcstat_prefetch_metadata_misses, 0); wmsum_init(&arc_sums.arcstat_mru_hits, 0); wmsum_init(&arc_sums.arcstat_mru_ghost_hits, 0); wmsum_init(&arc_sums.arcstat_mfu_hits, 0); wmsum_init(&arc_sums.arcstat_mfu_ghost_hits, 0); wmsum_init(&arc_sums.arcstat_deleted, 0); wmsum_init(&arc_sums.arcstat_mutex_miss, 0); wmsum_init(&arc_sums.arcstat_access_skip, 0); wmsum_init(&arc_sums.arcstat_evict_skip, 0); wmsum_init(&arc_sums.arcstat_evict_not_enough, 0); wmsum_init(&arc_sums.arcstat_evict_l2_cached, 0); wmsum_init(&arc_sums.arcstat_evict_l2_eligible, 0); wmsum_init(&arc_sums.arcstat_evict_l2_eligible_mfu, 0); wmsum_init(&arc_sums.arcstat_evict_l2_eligible_mru, 0); wmsum_init(&arc_sums.arcstat_evict_l2_ineligible, 0); wmsum_init(&arc_sums.arcstat_evict_l2_skip, 0); wmsum_init(&arc_sums.arcstat_hash_collisions, 0); wmsum_init(&arc_sums.arcstat_hash_chains, 0); aggsum_init(&arc_sums.arcstat_size, 0); wmsum_init(&arc_sums.arcstat_compressed_size, 0); wmsum_init(&arc_sums.arcstat_uncompressed_size, 0); wmsum_init(&arc_sums.arcstat_overhead_size, 0); wmsum_init(&arc_sums.arcstat_hdr_size, 0); wmsum_init(&arc_sums.arcstat_data_size, 0); wmsum_init(&arc_sums.arcstat_metadata_size, 0); wmsum_init(&arc_sums.arcstat_dbuf_size, 0); aggsum_init(&arc_sums.arcstat_dnode_size, 0); wmsum_init(&arc_sums.arcstat_bonus_size, 0); wmsum_init(&arc_sums.arcstat_l2_hits, 0); wmsum_init(&arc_sums.arcstat_l2_misses, 0); wmsum_init(&arc_sums.arcstat_l2_prefetch_asize, 0); wmsum_init(&arc_sums.arcstat_l2_mru_asize, 0); wmsum_init(&arc_sums.arcstat_l2_mfu_asize, 0); wmsum_init(&arc_sums.arcstat_l2_bufc_data_asize, 0); wmsum_init(&arc_sums.arcstat_l2_bufc_metadata_asize, 0); wmsum_init(&arc_sums.arcstat_l2_feeds, 0); wmsum_init(&arc_sums.arcstat_l2_rw_clash, 0); wmsum_init(&arc_sums.arcstat_l2_read_bytes, 0); wmsum_init(&arc_sums.arcstat_l2_write_bytes, 0); wmsum_init(&arc_sums.arcstat_l2_writes_sent, 0); wmsum_init(&arc_sums.arcstat_l2_writes_done, 0); wmsum_init(&arc_sums.arcstat_l2_writes_error, 0); wmsum_init(&arc_sums.arcstat_l2_writes_lock_retry, 0); wmsum_init(&arc_sums.arcstat_l2_evict_lock_retry, 0); wmsum_init(&arc_sums.arcstat_l2_evict_reading, 0); wmsum_init(&arc_sums.arcstat_l2_evict_l1cached, 0); wmsum_init(&arc_sums.arcstat_l2_free_on_write, 0); wmsum_init(&arc_sums.arcstat_l2_abort_lowmem, 0); wmsum_init(&arc_sums.arcstat_l2_cksum_bad, 0); wmsum_init(&arc_sums.arcstat_l2_io_error, 0); wmsum_init(&arc_sums.arcstat_l2_lsize, 0); wmsum_init(&arc_sums.arcstat_l2_psize, 0); aggsum_init(&arc_sums.arcstat_l2_hdr_size, 0); wmsum_init(&arc_sums.arcstat_l2_log_blk_writes, 0); wmsum_init(&arc_sums.arcstat_l2_log_blk_asize, 0); wmsum_init(&arc_sums.arcstat_l2_log_blk_count, 0); wmsum_init(&arc_sums.arcstat_l2_rebuild_success, 0); wmsum_init(&arc_sums.arcstat_l2_rebuild_abort_unsupported, 0); wmsum_init(&arc_sums.arcstat_l2_rebuild_abort_io_errors, 0); wmsum_init(&arc_sums.arcstat_l2_rebuild_abort_dh_errors, 0); wmsum_init(&arc_sums.arcstat_l2_rebuild_abort_cksum_lb_errors, 0); wmsum_init(&arc_sums.arcstat_l2_rebuild_abort_lowmem, 0); wmsum_init(&arc_sums.arcstat_l2_rebuild_size, 0); wmsum_init(&arc_sums.arcstat_l2_rebuild_asize, 0); wmsum_init(&arc_sums.arcstat_l2_rebuild_bufs, 0); wmsum_init(&arc_sums.arcstat_l2_rebuild_bufs_precached, 0); wmsum_init(&arc_sums.arcstat_l2_rebuild_log_blks, 0); wmsum_init(&arc_sums.arcstat_memory_throttle_count, 0); wmsum_init(&arc_sums.arcstat_memory_direct_count, 0); wmsum_init(&arc_sums.arcstat_memory_indirect_count, 0); wmsum_init(&arc_sums.arcstat_prune, 0); aggsum_init(&arc_sums.arcstat_meta_used, 0); wmsum_init(&arc_sums.arcstat_async_upgrade_sync, 0); wmsum_init(&arc_sums.arcstat_demand_hit_predictive_prefetch, 0); wmsum_init(&arc_sums.arcstat_demand_hit_prescient_prefetch, 0); wmsum_init(&arc_sums.arcstat_raw_size, 0); wmsum_init(&arc_sums.arcstat_cached_only_in_progress, 0); wmsum_init(&arc_sums.arcstat_abd_chunk_waste_size, 0); arc_anon->arcs_state = ARC_STATE_ANON; arc_mru->arcs_state = ARC_STATE_MRU; arc_mru_ghost->arcs_state = ARC_STATE_MRU_GHOST; arc_mfu->arcs_state = ARC_STATE_MFU; arc_mfu_ghost->arcs_state = ARC_STATE_MFU_GHOST; arc_l2c_only->arcs_state = ARC_STATE_L2C_ONLY; } static void arc_state_fini(void) { zfs_refcount_destroy(&arc_anon->arcs_esize[ARC_BUFC_METADATA]); zfs_refcount_destroy(&arc_anon->arcs_esize[ARC_BUFC_DATA]); zfs_refcount_destroy(&arc_mru->arcs_esize[ARC_BUFC_METADATA]); zfs_refcount_destroy(&arc_mru->arcs_esize[ARC_BUFC_DATA]); zfs_refcount_destroy(&arc_mru_ghost->arcs_esize[ARC_BUFC_METADATA]); zfs_refcount_destroy(&arc_mru_ghost->arcs_esize[ARC_BUFC_DATA]); zfs_refcount_destroy(&arc_mfu->arcs_esize[ARC_BUFC_METADATA]); zfs_refcount_destroy(&arc_mfu->arcs_esize[ARC_BUFC_DATA]); zfs_refcount_destroy(&arc_mfu_ghost->arcs_esize[ARC_BUFC_METADATA]); zfs_refcount_destroy(&arc_mfu_ghost->arcs_esize[ARC_BUFC_DATA]); zfs_refcount_destroy(&arc_l2c_only->arcs_esize[ARC_BUFC_METADATA]); zfs_refcount_destroy(&arc_l2c_only->arcs_esize[ARC_BUFC_DATA]); zfs_refcount_destroy(&arc_anon->arcs_size); zfs_refcount_destroy(&arc_mru->arcs_size); zfs_refcount_destroy(&arc_mru_ghost->arcs_size); zfs_refcount_destroy(&arc_mfu->arcs_size); zfs_refcount_destroy(&arc_mfu_ghost->arcs_size); zfs_refcount_destroy(&arc_l2c_only->arcs_size); multilist_destroy(&arc_mru->arcs_list[ARC_BUFC_METADATA]); multilist_destroy(&arc_mru_ghost->arcs_list[ARC_BUFC_METADATA]); multilist_destroy(&arc_mfu->arcs_list[ARC_BUFC_METADATA]); multilist_destroy(&arc_mfu_ghost->arcs_list[ARC_BUFC_METADATA]); multilist_destroy(&arc_mru->arcs_list[ARC_BUFC_DATA]); multilist_destroy(&arc_mru_ghost->arcs_list[ARC_BUFC_DATA]); multilist_destroy(&arc_mfu->arcs_list[ARC_BUFC_DATA]); multilist_destroy(&arc_mfu_ghost->arcs_list[ARC_BUFC_DATA]); multilist_destroy(&arc_l2c_only->arcs_list[ARC_BUFC_METADATA]); multilist_destroy(&arc_l2c_only->arcs_list[ARC_BUFC_DATA]); wmsum_fini(&arc_sums.arcstat_hits); wmsum_fini(&arc_sums.arcstat_misses); wmsum_fini(&arc_sums.arcstat_demand_data_hits); wmsum_fini(&arc_sums.arcstat_demand_data_misses); wmsum_fini(&arc_sums.arcstat_demand_metadata_hits); wmsum_fini(&arc_sums.arcstat_demand_metadata_misses); wmsum_fini(&arc_sums.arcstat_prefetch_data_hits); wmsum_fini(&arc_sums.arcstat_prefetch_data_misses); wmsum_fini(&arc_sums.arcstat_prefetch_metadata_hits); wmsum_fini(&arc_sums.arcstat_prefetch_metadata_misses); wmsum_fini(&arc_sums.arcstat_mru_hits); wmsum_fini(&arc_sums.arcstat_mru_ghost_hits); wmsum_fini(&arc_sums.arcstat_mfu_hits); wmsum_fini(&arc_sums.arcstat_mfu_ghost_hits); wmsum_fini(&arc_sums.arcstat_deleted); wmsum_fini(&arc_sums.arcstat_mutex_miss); wmsum_fini(&arc_sums.arcstat_access_skip); wmsum_fini(&arc_sums.arcstat_evict_skip); wmsum_fini(&arc_sums.arcstat_evict_not_enough); wmsum_fini(&arc_sums.arcstat_evict_l2_cached); wmsum_fini(&arc_sums.arcstat_evict_l2_eligible); wmsum_fini(&arc_sums.arcstat_evict_l2_eligible_mfu); wmsum_fini(&arc_sums.arcstat_evict_l2_eligible_mru); wmsum_fini(&arc_sums.arcstat_evict_l2_ineligible); wmsum_fini(&arc_sums.arcstat_evict_l2_skip); wmsum_fini(&arc_sums.arcstat_hash_collisions); wmsum_fini(&arc_sums.arcstat_hash_chains); aggsum_fini(&arc_sums.arcstat_size); wmsum_fini(&arc_sums.arcstat_compressed_size); wmsum_fini(&arc_sums.arcstat_uncompressed_size); wmsum_fini(&arc_sums.arcstat_overhead_size); wmsum_fini(&arc_sums.arcstat_hdr_size); wmsum_fini(&arc_sums.arcstat_data_size); wmsum_fini(&arc_sums.arcstat_metadata_size); wmsum_fini(&arc_sums.arcstat_dbuf_size); aggsum_fini(&arc_sums.arcstat_dnode_size); wmsum_fini(&arc_sums.arcstat_bonus_size); wmsum_fini(&arc_sums.arcstat_l2_hits); wmsum_fini(&arc_sums.arcstat_l2_misses); wmsum_fini(&arc_sums.arcstat_l2_prefetch_asize); wmsum_fini(&arc_sums.arcstat_l2_mru_asize); wmsum_fini(&arc_sums.arcstat_l2_mfu_asize); wmsum_fini(&arc_sums.arcstat_l2_bufc_data_asize); wmsum_fini(&arc_sums.arcstat_l2_bufc_metadata_asize); wmsum_fini(&arc_sums.arcstat_l2_feeds); wmsum_fini(&arc_sums.arcstat_l2_rw_clash); wmsum_fini(&arc_sums.arcstat_l2_read_bytes); wmsum_fini(&arc_sums.arcstat_l2_write_bytes); wmsum_fini(&arc_sums.arcstat_l2_writes_sent); wmsum_fini(&arc_sums.arcstat_l2_writes_done); wmsum_fini(&arc_sums.arcstat_l2_writes_error); wmsum_fini(&arc_sums.arcstat_l2_writes_lock_retry); wmsum_fini(&arc_sums.arcstat_l2_evict_lock_retry); wmsum_fini(&arc_sums.arcstat_l2_evict_reading); wmsum_fini(&arc_sums.arcstat_l2_evict_l1cached); wmsum_fini(&arc_sums.arcstat_l2_free_on_write); wmsum_fini(&arc_sums.arcstat_l2_abort_lowmem); wmsum_fini(&arc_sums.arcstat_l2_cksum_bad); wmsum_fini(&arc_sums.arcstat_l2_io_error); wmsum_fini(&arc_sums.arcstat_l2_lsize); wmsum_fini(&arc_sums.arcstat_l2_psize); aggsum_fini(&arc_sums.arcstat_l2_hdr_size); wmsum_fini(&arc_sums.arcstat_l2_log_blk_writes); wmsum_fini(&arc_sums.arcstat_l2_log_blk_asize); wmsum_fini(&arc_sums.arcstat_l2_log_blk_count); wmsum_fini(&arc_sums.arcstat_l2_rebuild_success); wmsum_fini(&arc_sums.arcstat_l2_rebuild_abort_unsupported); wmsum_fini(&arc_sums.arcstat_l2_rebuild_abort_io_errors); wmsum_fini(&arc_sums.arcstat_l2_rebuild_abort_dh_errors); wmsum_fini(&arc_sums.arcstat_l2_rebuild_abort_cksum_lb_errors); wmsum_fini(&arc_sums.arcstat_l2_rebuild_abort_lowmem); wmsum_fini(&arc_sums.arcstat_l2_rebuild_size); wmsum_fini(&arc_sums.arcstat_l2_rebuild_asize); wmsum_fini(&arc_sums.arcstat_l2_rebuild_bufs); wmsum_fini(&arc_sums.arcstat_l2_rebuild_bufs_precached); wmsum_fini(&arc_sums.arcstat_l2_rebuild_log_blks); wmsum_fini(&arc_sums.arcstat_memory_throttle_count); wmsum_fini(&arc_sums.arcstat_memory_direct_count); wmsum_fini(&arc_sums.arcstat_memory_indirect_count); wmsum_fini(&arc_sums.arcstat_prune); aggsum_fini(&arc_sums.arcstat_meta_used); wmsum_fini(&arc_sums.arcstat_async_upgrade_sync); wmsum_fini(&arc_sums.arcstat_demand_hit_predictive_prefetch); wmsum_fini(&arc_sums.arcstat_demand_hit_prescient_prefetch); wmsum_fini(&arc_sums.arcstat_raw_size); wmsum_fini(&arc_sums.arcstat_cached_only_in_progress); wmsum_fini(&arc_sums.arcstat_abd_chunk_waste_size); } uint64_t arc_target_bytes(void) { return (arc_c); } void arc_set_limits(uint64_t allmem) { /* Set min cache to 1/32 of all memory, or 32MB, whichever is more. */ arc_c_min = MAX(allmem / 32, 2ULL << SPA_MAXBLOCKSHIFT); /* How to set default max varies by platform. */ arc_c_max = arc_default_max(arc_c_min, allmem); } void arc_init(void) { uint64_t percent, allmem = arc_all_memory(); mutex_init(&arc_evict_lock, NULL, MUTEX_DEFAULT, NULL); list_create(&arc_evict_waiters, sizeof (arc_evict_waiter_t), offsetof(arc_evict_waiter_t, aew_node)); arc_min_prefetch_ms = 1000; arc_min_prescient_prefetch_ms = 6000; #if defined(_KERNEL) arc_lowmem_init(); #endif arc_set_limits(allmem); #ifdef _KERNEL /* * If zfs_arc_max is non-zero at init, meaning it was set in the kernel * environment before the module was loaded, don't block setting the * maximum because it is less than arc_c_min, instead, reset arc_c_min * to a lower value. * zfs_arc_min will be handled by arc_tuning_update(). */ if (zfs_arc_max != 0 && zfs_arc_max >= MIN_ARC_MAX && zfs_arc_max < allmem) { arc_c_max = zfs_arc_max; if (arc_c_min >= arc_c_max) { arc_c_min = MAX(zfs_arc_max / 2, 2ULL << SPA_MAXBLOCKSHIFT); } } #else /* * In userland, there's only the memory pressure that we artificially * create (see arc_available_memory()). Don't let arc_c get too * small, because it can cause transactions to be larger than * arc_c, causing arc_tempreserve_space() to fail. */ arc_c_min = MAX(arc_c_max / 2, 2ULL << SPA_MAXBLOCKSHIFT); #endif arc_c = arc_c_min; arc_p = (arc_c >> 1); /* Set min to 1/2 of arc_c_min */ arc_meta_min = 1ULL << SPA_MAXBLOCKSHIFT; /* * Set arc_meta_limit to a percent of arc_c_max with a floor of * arc_meta_min, and a ceiling of arc_c_max. */ percent = MIN(zfs_arc_meta_limit_percent, 100); arc_meta_limit = MAX(arc_meta_min, (percent * arc_c_max) / 100); percent = MIN(zfs_arc_dnode_limit_percent, 100); arc_dnode_size_limit = (percent * arc_meta_limit) / 100; /* Apply user specified tunings */ arc_tuning_update(B_TRUE); /* if kmem_flags are set, lets try to use less memory */ if (kmem_debugging()) arc_c = arc_c / 2; if (arc_c < arc_c_min) arc_c = arc_c_min; arc_register_hotplug(); arc_state_init(); buf_init(); list_create(&arc_prune_list, sizeof (arc_prune_t), offsetof(arc_prune_t, p_node)); mutex_init(&arc_prune_mtx, NULL, MUTEX_DEFAULT, NULL); arc_prune_taskq = taskq_create("arc_prune", zfs_arc_prune_task_threads, defclsyspri, 100, INT_MAX, TASKQ_PREPOPULATE | TASKQ_DYNAMIC); arc_ksp = kstat_create("zfs", 0, "arcstats", "misc", KSTAT_TYPE_NAMED, sizeof (arc_stats) / sizeof (kstat_named_t), KSTAT_FLAG_VIRTUAL); if (arc_ksp != NULL) { arc_ksp->ks_data = &arc_stats; arc_ksp->ks_update = arc_kstat_update; kstat_install(arc_ksp); } arc_state_evict_markers = arc_state_alloc_markers(arc_state_evict_marker_count); arc_evict_zthr = zthr_create("arc_evict", arc_evict_cb_check, arc_evict_cb, NULL, defclsyspri); arc_reap_zthr = zthr_create_timer("arc_reap", arc_reap_cb_check, arc_reap_cb, NULL, SEC2NSEC(1), minclsyspri); arc_warm = B_FALSE; /* * Calculate maximum amount of dirty data per pool. * * If it has been set by a module parameter, take that. * Otherwise, use a percentage of physical memory defined by * zfs_dirty_data_max_percent (default 10%) with a cap at * zfs_dirty_data_max_max (default 4G or 25% of physical memory). */ #ifdef __LP64__ if (zfs_dirty_data_max_max == 0) zfs_dirty_data_max_max = MIN(4ULL * 1024 * 1024 * 1024, allmem * zfs_dirty_data_max_max_percent / 100); #else if (zfs_dirty_data_max_max == 0) zfs_dirty_data_max_max = MIN(1ULL * 1024 * 1024 * 1024, allmem * zfs_dirty_data_max_max_percent / 100); #endif if (zfs_dirty_data_max == 0) { zfs_dirty_data_max = allmem * zfs_dirty_data_max_percent / 100; zfs_dirty_data_max = MIN(zfs_dirty_data_max, zfs_dirty_data_max_max); } if (zfs_wrlog_data_max == 0) { /* * dp_wrlog_total is reduced for each txg at the end of * spa_sync(). However, dp_dirty_total is reduced every time * a block is written out. Thus under normal operation, * dp_wrlog_total could grow 2 times as big as * zfs_dirty_data_max. */ zfs_wrlog_data_max = zfs_dirty_data_max * 2; } } void arc_fini(void) { arc_prune_t *p; #ifdef _KERNEL arc_lowmem_fini(); #endif /* _KERNEL */ /* Use B_TRUE to ensure *all* buffers are evicted */ arc_flush(NULL, B_TRUE); if (arc_ksp != NULL) { kstat_delete(arc_ksp); arc_ksp = NULL; } taskq_wait(arc_prune_taskq); taskq_destroy(arc_prune_taskq); mutex_enter(&arc_prune_mtx); while ((p = list_head(&arc_prune_list)) != NULL) { list_remove(&arc_prune_list, p); zfs_refcount_remove(&p->p_refcnt, &arc_prune_list); zfs_refcount_destroy(&p->p_refcnt); kmem_free(p, sizeof (*p)); } mutex_exit(&arc_prune_mtx); list_destroy(&arc_prune_list); mutex_destroy(&arc_prune_mtx); (void) zthr_cancel(arc_evict_zthr); (void) zthr_cancel(arc_reap_zthr); arc_state_free_markers(arc_state_evict_markers, arc_state_evict_marker_count); mutex_destroy(&arc_evict_lock); list_destroy(&arc_evict_waiters); /* * Free any buffers that were tagged for destruction. This needs * to occur before arc_state_fini() runs and destroys the aggsum * values which are updated when freeing scatter ABDs. */ l2arc_do_free_on_write(); /* * buf_fini() must proceed arc_state_fini() because buf_fin() may * trigger the release of kmem magazines, which can callback to * arc_space_return() which accesses aggsums freed in act_state_fini(). */ buf_fini(); arc_state_fini(); arc_unregister_hotplug(); /* * We destroy the zthrs after all the ARC state has been * torn down to avoid the case of them receiving any * wakeup() signals after they are destroyed. */ zthr_destroy(arc_evict_zthr); zthr_destroy(arc_reap_zthr); ASSERT0(arc_loaned_bytes); } /* * Level 2 ARC * * The level 2 ARC (L2ARC) is a cache layer in-between main memory and disk. * It uses dedicated storage devices to hold cached data, which are populated * using large infrequent writes. The main role of this cache is to boost * the performance of random read workloads. The intended L2ARC devices * include short-stroked disks, solid state disks, and other media with * substantially faster read latency than disk. * * +-----------------------+ * | ARC | * +-----------------------+ * | ^ ^ * | | | * l2arc_feed_thread() arc_read() * | | | * | l2arc read | * V | | * +---------------+ | * | L2ARC | | * +---------------+ | * | ^ | * l2arc_write() | | * | | | * V | | * +-------+ +-------+ * | vdev | | vdev | * | cache | | cache | * +-------+ +-------+ * +=========+ .-----. * : L2ARC : |-_____-| * : devices : | Disks | * +=========+ `-_____-' * * Read requests are satisfied from the following sources, in order: * * 1) ARC * 2) vdev cache of L2ARC devices * 3) L2ARC devices * 4) vdev cache of disks * 5) disks * * Some L2ARC device types exhibit extremely slow write performance. * To accommodate for this there are some significant differences between * the L2ARC and traditional cache design: * * 1. There is no eviction path from the ARC to the L2ARC. Evictions from * the ARC behave as usual, freeing buffers and placing headers on ghost * lists. The ARC does not send buffers to the L2ARC during eviction as * this would add inflated write latencies for all ARC memory pressure. * * 2. The L2ARC attempts to cache data from the ARC before it is evicted. * It does this by periodically scanning buffers from the eviction-end of * the MFU and MRU ARC lists, copying them to the L2ARC devices if they are * not already there. It scans until a headroom of buffers is satisfied, * which itself is a buffer for ARC eviction. If a compressible buffer is * found during scanning and selected for writing to an L2ARC device, we * temporarily boost scanning headroom during the next scan cycle to make * sure we adapt to compression effects (which might significantly reduce * the data volume we write to L2ARC). The thread that does this is * l2arc_feed_thread(), illustrated below; example sizes are included to * provide a better sense of ratio than this diagram: * * head --> tail * +---------------------+----------+ * ARC_mfu |:::::#:::::::::::::::|o#o###o###|-->. # already on L2ARC * +---------------------+----------+ | o L2ARC eligible * ARC_mru |:#:::::::::::::::::::|#o#ooo####|-->| : ARC buffer * +---------------------+----------+ | * 15.9 Gbytes ^ 32 Mbytes | * headroom | * l2arc_feed_thread() * | * l2arc write hand <--[oooo]--' * | 8 Mbyte * | write max * V * +==============================+ * L2ARC dev |####|#|###|###| |####| ... | * +==============================+ * 32 Gbytes * * 3. If an ARC buffer is copied to the L2ARC but then hit instead of * evicted, then the L2ARC has cached a buffer much sooner than it probably * needed to, potentially wasting L2ARC device bandwidth and storage. It is * safe to say that this is an uncommon case, since buffers at the end of * the ARC lists have moved there due to inactivity. * * 4. If the ARC evicts faster than the L2ARC can maintain a headroom, * then the L2ARC simply misses copying some buffers. This serves as a * pressure valve to prevent heavy read workloads from both stalling the ARC * with waits and clogging the L2ARC with writes. This also helps prevent * the potential for the L2ARC to churn if it attempts to cache content too * quickly, such as during backups of the entire pool. * * 5. After system boot and before the ARC has filled main memory, there are * no evictions from the ARC and so the tails of the ARC_mfu and ARC_mru * lists can remain mostly static. Instead of searching from tail of these * lists as pictured, the l2arc_feed_thread() will search from the list heads * for eligible buffers, greatly increasing its chance of finding them. * * The L2ARC device write speed is also boosted during this time so that * the L2ARC warms up faster. Since there have been no ARC evictions yet, * there are no L2ARC reads, and no fear of degrading read performance * through increased writes. * * 6. Writes to the L2ARC devices are grouped and sent in-sequence, so that * the vdev queue can aggregate them into larger and fewer writes. Each * device is written to in a rotor fashion, sweeping writes through * available space then repeating. * * 7. The L2ARC does not store dirty content. It never needs to flush * write buffers back to disk based storage. * * 8. If an ARC buffer is written (and dirtied) which also exists in the * L2ARC, the now stale L2ARC buffer is immediately dropped. * * The performance of the L2ARC can be tweaked by a number of tunables, which * may be necessary for different workloads: * * l2arc_write_max max write bytes per interval * l2arc_write_boost extra write bytes during device warmup * l2arc_noprefetch skip caching prefetched buffers * l2arc_headroom number of max device writes to precache * l2arc_headroom_boost when we find compressed buffers during ARC * scanning, we multiply headroom by this * percentage factor for the next scan cycle, * since more compressed buffers are likely to * be present * l2arc_feed_secs seconds between L2ARC writing * * Tunables may be removed or added as future performance improvements are * integrated, and also may become zpool properties. * * There are three key functions that control how the L2ARC warms up: * * l2arc_write_eligible() check if a buffer is eligible to cache * l2arc_write_size() calculate how much to write * l2arc_write_interval() calculate sleep delay between writes * * These three functions determine what to write, how much, and how quickly * to send writes. * * L2ARC persistence: * * When writing buffers to L2ARC, we periodically add some metadata to * make sure we can pick them up after reboot, thus dramatically reducing * the impact that any downtime has on the performance of storage systems * with large caches. * * The implementation works fairly simply by integrating the following two * modifications: * * *) When writing to the L2ARC, we occasionally write a "l2arc log block", * which is an additional piece of metadata which describes what's been * written. This allows us to rebuild the arc_buf_hdr_t structures of the * main ARC buffers. There are 2 linked-lists of log blocks headed by * dh_start_lbps[2]. We alternate which chain we append to, so they are * time-wise and offset-wise interleaved, but that is an optimization rather * than for correctness. The log block also includes a pointer to the * previous block in its chain. * * *) We reserve SPA_MINBLOCKSIZE of space at the start of each L2ARC device * for our header bookkeeping purposes. This contains a device header, * which contains our top-level reference structures. We update it each * time we write a new log block, so that we're able to locate it in the * L2ARC device. If this write results in an inconsistent device header * (e.g. due to power failure), we detect this by verifying the header's * checksum and simply fail to reconstruct the L2ARC after reboot. * * Implementation diagram: * * +=== L2ARC device (not to scale) ======================================+ * | ___two newest log block pointers__.__________ | * | / \dh_start_lbps[1] | * | / \ \dh_start_lbps[0]| * |.___/__. V V | * ||L2 dev|....|lb |bufs |lb |bufs |lb |bufs |lb |bufs |lb |---(empty)---| * || hdr| ^ /^ /^ / / | * |+------+ ...--\-------/ \-----/--\------/ / | * | \--------------/ \--------------/ | * +======================================================================+ * * As can be seen on the diagram, rather than using a simple linked list, * we use a pair of linked lists with alternating elements. This is a * performance enhancement due to the fact that we only find out the * address of the next log block access once the current block has been * completely read in. Obviously, this hurts performance, because we'd be * keeping the device's I/O queue at only a 1 operation deep, thus * incurring a large amount of I/O round-trip latency. Having two lists * allows us to fetch two log blocks ahead of where we are currently * rebuilding L2ARC buffers. * * On-device data structures: * * L2ARC device header: l2arc_dev_hdr_phys_t * L2ARC log block: l2arc_log_blk_phys_t * * L2ARC reconstruction: * * When writing data, we simply write in the standard rotary fashion, * evicting buffers as we go and simply writing new data over them (writing * a new log block every now and then). This obviously means that once we * loop around the end of the device, we will start cutting into an already * committed log block (and its referenced data buffers), like so: * * current write head__ __old tail * \ / * V V * <--|bufs |lb |bufs |lb | |bufs |lb |bufs |lb |--> * ^ ^^^^^^^^^___________________________________ * | \ * <> may overwrite this blk and/or its bufs --' * * When importing the pool, we detect this situation and use it to stop * our scanning process (see l2arc_rebuild). * * There is one significant caveat to consider when rebuilding ARC contents * from an L2ARC device: what about invalidated buffers? Given the above * construction, we cannot update blocks which we've already written to amend * them to remove buffers which were invalidated. Thus, during reconstruction, * we might be populating the cache with buffers for data that's not on the * main pool anymore, or may have been overwritten! * * As it turns out, this isn't a problem. Every arc_read request includes * both the DVA and, crucially, the birth TXG of the BP the caller is * looking for. So even if the cache were populated by completely rotten * blocks for data that had been long deleted and/or overwritten, we'll * never actually return bad data from the cache, since the DVA with the * birth TXG uniquely identify a block in space and time - once created, * a block is immutable on disk. The worst thing we have done is wasted * some time and memory at l2arc rebuild to reconstruct outdated ARC * entries that will get dropped from the l2arc as it is being updated * with new blocks. * * L2ARC buffers that have been evicted by l2arc_evict() ahead of the write * hand are not restored. This is done by saving the offset (in bytes) * l2arc_evict() has evicted to in the L2ARC device header and taking it * into account when restoring buffers. */ static boolean_t l2arc_write_eligible(uint64_t spa_guid, arc_buf_hdr_t *hdr) { /* * A buffer is *not* eligible for the L2ARC if it: * 1. belongs to a different spa. * 2. is already cached on the L2ARC. * 3. has an I/O in progress (it may be an incomplete read). * 4. is flagged not eligible (zfs property). */ if (hdr->b_spa != spa_guid || HDR_HAS_L2HDR(hdr) || HDR_IO_IN_PROGRESS(hdr) || !HDR_L2CACHE(hdr)) return (B_FALSE); return (B_TRUE); } static uint64_t l2arc_write_size(l2arc_dev_t *dev) { uint64_t size, dev_size, tsize; /* * Make sure our globals have meaningful values in case the user * altered them. */ size = l2arc_write_max; if (size == 0) { cmn_err(CE_NOTE, "Bad value for l2arc_write_max, value must " "be greater than zero, resetting it to the default (%d)", L2ARC_WRITE_SIZE); size = l2arc_write_max = L2ARC_WRITE_SIZE; } if (arc_warm == B_FALSE) size += l2arc_write_boost; /* * Make sure the write size does not exceed the size of the cache * device. This is important in l2arc_evict(), otherwise infinite * iteration can occur. */ dev_size = dev->l2ad_end - dev->l2ad_start; tsize = size + l2arc_log_blk_overhead(size, dev); if (dev->l2ad_vdev->vdev_has_trim && l2arc_trim_ahead > 0) tsize += MAX(64 * 1024 * 1024, (tsize * l2arc_trim_ahead) / 100); if (tsize >= dev_size) { cmn_err(CE_NOTE, "l2arc_write_max or l2arc_write_boost " "plus the overhead of log blocks (persistent L2ARC, " "%llu bytes) exceeds the size of the cache device " "(guid %llu), resetting them to the default (%d)", (u_longlong_t)l2arc_log_blk_overhead(size, dev), (u_longlong_t)dev->l2ad_vdev->vdev_guid, L2ARC_WRITE_SIZE); size = l2arc_write_max = l2arc_write_boost = L2ARC_WRITE_SIZE; if (arc_warm == B_FALSE) size += l2arc_write_boost; } return (size); } static clock_t l2arc_write_interval(clock_t began, uint64_t wanted, uint64_t wrote) { clock_t interval, next, now; /* * If the ARC lists are busy, increase our write rate; if the * lists are stale, idle back. This is achieved by checking * how much we previously wrote - if it was more than half of * what we wanted, schedule the next write much sooner. */ if (l2arc_feed_again && wrote > (wanted / 2)) interval = (hz * l2arc_feed_min_ms) / 1000; else interval = hz * l2arc_feed_secs; now = ddi_get_lbolt(); next = MAX(now, MIN(now + interval, began + interval)); return (next); } /* * Cycle through L2ARC devices. This is how L2ARC load balances. * If a device is returned, this also returns holding the spa config lock. */ static l2arc_dev_t * l2arc_dev_get_next(void) { l2arc_dev_t *first, *next = NULL; /* * Lock out the removal of spas (spa_namespace_lock), then removal * of cache devices (l2arc_dev_mtx). Once a device has been selected, * both locks will be dropped and a spa config lock held instead. */ mutex_enter(&spa_namespace_lock); mutex_enter(&l2arc_dev_mtx); /* if there are no vdevs, there is nothing to do */ if (l2arc_ndev == 0) goto out; first = NULL; next = l2arc_dev_last; do { /* loop around the list looking for a non-faulted vdev */ if (next == NULL) { next = list_head(l2arc_dev_list); } else { next = list_next(l2arc_dev_list, next); if (next == NULL) next = list_head(l2arc_dev_list); } /* if we have come back to the start, bail out */ if (first == NULL) first = next; else if (next == first) break; ASSERT3P(next, !=, NULL); } while (vdev_is_dead(next->l2ad_vdev) || next->l2ad_rebuild || next->l2ad_trim_all); /* if we were unable to find any usable vdevs, return NULL */ if (vdev_is_dead(next->l2ad_vdev) || next->l2ad_rebuild || next->l2ad_trim_all) next = NULL; l2arc_dev_last = next; out: mutex_exit(&l2arc_dev_mtx); /* * Grab the config lock to prevent the 'next' device from being * removed while we are writing to it. */ if (next != NULL) spa_config_enter(next->l2ad_spa, SCL_L2ARC, next, RW_READER); mutex_exit(&spa_namespace_lock); return (next); } /* * Free buffers that were tagged for destruction. */ static void l2arc_do_free_on_write(void) { list_t *buflist; l2arc_data_free_t *df, *df_prev; mutex_enter(&l2arc_free_on_write_mtx); buflist = l2arc_free_on_write; for (df = list_tail(buflist); df; df = df_prev) { df_prev = list_prev(buflist, df); ASSERT3P(df->l2df_abd, !=, NULL); abd_free(df->l2df_abd); list_remove(buflist, df); kmem_free(df, sizeof (l2arc_data_free_t)); } mutex_exit(&l2arc_free_on_write_mtx); } /* * A write to a cache device has completed. Update all headers to allow * reads from these buffers to begin. */ static void l2arc_write_done(zio_t *zio) { l2arc_write_callback_t *cb; l2arc_lb_abd_buf_t *abd_buf; l2arc_lb_ptr_buf_t *lb_ptr_buf; l2arc_dev_t *dev; l2arc_dev_hdr_phys_t *l2dhdr; list_t *buflist; arc_buf_hdr_t *head, *hdr, *hdr_prev; kmutex_t *hash_lock; int64_t bytes_dropped = 0; cb = zio->io_private; ASSERT3P(cb, !=, NULL); dev = cb->l2wcb_dev; l2dhdr = dev->l2ad_dev_hdr; ASSERT3P(dev, !=, NULL); head = cb->l2wcb_head; ASSERT3P(head, !=, NULL); buflist = &dev->l2ad_buflist; ASSERT3P(buflist, !=, NULL); DTRACE_PROBE2(l2arc__iodone, zio_t *, zio, l2arc_write_callback_t *, cb); /* * All writes completed, or an error was hit. */ top: mutex_enter(&dev->l2ad_mtx); for (hdr = list_prev(buflist, head); hdr; hdr = hdr_prev) { hdr_prev = list_prev(buflist, hdr); hash_lock = HDR_LOCK(hdr); /* * We cannot use mutex_enter or else we can deadlock * with l2arc_write_buffers (due to swapping the order * the hash lock and l2ad_mtx are taken). */ if (!mutex_tryenter(hash_lock)) { /* * Missed the hash lock. We must retry so we * don't leave the ARC_FLAG_L2_WRITING bit set. */ ARCSTAT_BUMP(arcstat_l2_writes_lock_retry); /* * We don't want to rescan the headers we've * already marked as having been written out, so * we reinsert the head node so we can pick up * where we left off. */ list_remove(buflist, head); list_insert_after(buflist, hdr, head); mutex_exit(&dev->l2ad_mtx); /* * We wait for the hash lock to become available * to try and prevent busy waiting, and increase * the chance we'll be able to acquire the lock * the next time around. */ mutex_enter(hash_lock); mutex_exit(hash_lock); goto top; } /* * We could not have been moved into the arc_l2c_only * state while in-flight due to our ARC_FLAG_L2_WRITING * bit being set. Let's just ensure that's being enforced. */ ASSERT(HDR_HAS_L1HDR(hdr)); /* * Skipped - drop L2ARC entry and mark the header as no * longer L2 eligibile. */ if (zio->io_error != 0) { /* * Error - drop L2ARC entry. */ list_remove(buflist, hdr); arc_hdr_clear_flags(hdr, ARC_FLAG_HAS_L2HDR); uint64_t psize = HDR_GET_PSIZE(hdr); l2arc_hdr_arcstats_decrement(hdr); bytes_dropped += vdev_psize_to_asize(dev->l2ad_vdev, psize); (void) zfs_refcount_remove_many(&dev->l2ad_alloc, arc_hdr_size(hdr), hdr); } /* * Allow ARC to begin reads and ghost list evictions to * this L2ARC entry. */ arc_hdr_clear_flags(hdr, ARC_FLAG_L2_WRITING); mutex_exit(hash_lock); } /* * Free the allocated abd buffers for writing the log blocks. * If the zio failed reclaim the allocated space and remove the * pointers to these log blocks from the log block pointer list * of the L2ARC device. */ while ((abd_buf = list_remove_tail(&cb->l2wcb_abd_list)) != NULL) { abd_free(abd_buf->abd); zio_buf_free(abd_buf, sizeof (*abd_buf)); if (zio->io_error != 0) { lb_ptr_buf = list_remove_head(&dev->l2ad_lbptr_list); /* * L2BLK_GET_PSIZE returns aligned size for log * blocks. */ uint64_t asize = L2BLK_GET_PSIZE((lb_ptr_buf->lb_ptr)->lbp_prop); bytes_dropped += asize; ARCSTAT_INCR(arcstat_l2_log_blk_asize, -asize); ARCSTAT_BUMPDOWN(arcstat_l2_log_blk_count); zfs_refcount_remove_many(&dev->l2ad_lb_asize, asize, lb_ptr_buf); zfs_refcount_remove(&dev->l2ad_lb_count, lb_ptr_buf); kmem_free(lb_ptr_buf->lb_ptr, sizeof (l2arc_log_blkptr_t)); kmem_free(lb_ptr_buf, sizeof (l2arc_lb_ptr_buf_t)); } } list_destroy(&cb->l2wcb_abd_list); if (zio->io_error != 0) { ARCSTAT_BUMP(arcstat_l2_writes_error); /* * Restore the lbps array in the header to its previous state. * If the list of log block pointers is empty, zero out the * log block pointers in the device header. */ lb_ptr_buf = list_head(&dev->l2ad_lbptr_list); for (int i = 0; i < 2; i++) { if (lb_ptr_buf == NULL) { /* * If the list is empty zero out the device * header. Otherwise zero out the second log * block pointer in the header. */ if (i == 0) { memset(l2dhdr, 0, dev->l2ad_dev_hdr_asize); } else { memset(&l2dhdr->dh_start_lbps[i], 0, sizeof (l2arc_log_blkptr_t)); } break; } memcpy(&l2dhdr->dh_start_lbps[i], lb_ptr_buf->lb_ptr, sizeof (l2arc_log_blkptr_t)); lb_ptr_buf = list_next(&dev->l2ad_lbptr_list, lb_ptr_buf); } } ARCSTAT_BUMP(arcstat_l2_writes_done); list_remove(buflist, head); ASSERT(!HDR_HAS_L1HDR(head)); kmem_cache_free(hdr_l2only_cache, head); mutex_exit(&dev->l2ad_mtx); ASSERT(dev->l2ad_vdev != NULL); vdev_space_update(dev->l2ad_vdev, -bytes_dropped, 0, 0); l2arc_do_free_on_write(); kmem_free(cb, sizeof (l2arc_write_callback_t)); } static int l2arc_untransform(zio_t *zio, l2arc_read_callback_t *cb) { int ret; spa_t *spa = zio->io_spa; arc_buf_hdr_t *hdr = cb->l2rcb_hdr; blkptr_t *bp = zio->io_bp; uint8_t salt[ZIO_DATA_SALT_LEN]; uint8_t iv[ZIO_DATA_IV_LEN]; uint8_t mac[ZIO_DATA_MAC_LEN]; boolean_t no_crypt = B_FALSE; /* * ZIL data is never be written to the L2ARC, so we don't need * special handling for its unique MAC storage. */ ASSERT3U(BP_GET_TYPE(bp), !=, DMU_OT_INTENT_LOG); ASSERT(MUTEX_HELD(HDR_LOCK(hdr))); ASSERT3P(hdr->b_l1hdr.b_pabd, !=, NULL); /* * If the data was encrypted, decrypt it now. Note that * we must check the bp here and not the hdr, since the * hdr does not have its encryption parameters updated * until arc_read_done(). */ if (BP_IS_ENCRYPTED(bp)) { abd_t *eabd = arc_get_data_abd(hdr, arc_hdr_size(hdr), hdr, ARC_HDR_DO_ADAPT | ARC_HDR_USE_RESERVE); zio_crypt_decode_params_bp(bp, salt, iv); zio_crypt_decode_mac_bp(bp, mac); ret = spa_do_crypt_abd(B_FALSE, spa, &cb->l2rcb_zb, BP_GET_TYPE(bp), BP_GET_DEDUP(bp), BP_SHOULD_BYTESWAP(bp), salt, iv, mac, HDR_GET_PSIZE(hdr), eabd, hdr->b_l1hdr.b_pabd, &no_crypt); if (ret != 0) { arc_free_data_abd(hdr, eabd, arc_hdr_size(hdr), hdr); goto error; } /* * If we actually performed decryption, replace b_pabd * with the decrypted data. Otherwise we can just throw * our decryption buffer away. */ if (!no_crypt) { arc_free_data_abd(hdr, hdr->b_l1hdr.b_pabd, arc_hdr_size(hdr), hdr); hdr->b_l1hdr.b_pabd = eabd; zio->io_abd = eabd; } else { arc_free_data_abd(hdr, eabd, arc_hdr_size(hdr), hdr); } } /* * If the L2ARC block was compressed, but ARC compression * is disabled we decompress the data into a new buffer and * replace the existing data. */ if (HDR_GET_COMPRESS(hdr) != ZIO_COMPRESS_OFF && !HDR_COMPRESSION_ENABLED(hdr)) { abd_t *cabd = arc_get_data_abd(hdr, arc_hdr_size(hdr), hdr, ARC_HDR_DO_ADAPT | ARC_HDR_USE_RESERVE); void *tmp = abd_borrow_buf(cabd, arc_hdr_size(hdr)); ret = zio_decompress_data(HDR_GET_COMPRESS(hdr), hdr->b_l1hdr.b_pabd, tmp, HDR_GET_PSIZE(hdr), HDR_GET_LSIZE(hdr), &hdr->b_complevel); if (ret != 0) { abd_return_buf_copy(cabd, tmp, arc_hdr_size(hdr)); arc_free_data_abd(hdr, cabd, arc_hdr_size(hdr), hdr); goto error; } abd_return_buf_copy(cabd, tmp, arc_hdr_size(hdr)); arc_free_data_abd(hdr, hdr->b_l1hdr.b_pabd, arc_hdr_size(hdr), hdr); hdr->b_l1hdr.b_pabd = cabd; zio->io_abd = cabd; zio->io_size = HDR_GET_LSIZE(hdr); } return (0); error: return (ret); } /* * A read to a cache device completed. Validate buffer contents before * handing over to the regular ARC routines. */ static void l2arc_read_done(zio_t *zio) { int tfm_error = 0; l2arc_read_callback_t *cb = zio->io_private; arc_buf_hdr_t *hdr; kmutex_t *hash_lock; boolean_t valid_cksum; boolean_t using_rdata = (BP_IS_ENCRYPTED(&cb->l2rcb_bp) && (cb->l2rcb_flags & ZIO_FLAG_RAW_ENCRYPT)); ASSERT3P(zio->io_vd, !=, NULL); ASSERT(zio->io_flags & ZIO_FLAG_DONT_PROPAGATE); spa_config_exit(zio->io_spa, SCL_L2ARC, zio->io_vd); ASSERT3P(cb, !=, NULL); hdr = cb->l2rcb_hdr; ASSERT3P(hdr, !=, NULL); hash_lock = HDR_LOCK(hdr); mutex_enter(hash_lock); ASSERT3P(hash_lock, ==, HDR_LOCK(hdr)); /* * If the data was read into a temporary buffer, * move it and free the buffer. */ if (cb->l2rcb_abd != NULL) { ASSERT3U(arc_hdr_size(hdr), <, zio->io_size); if (zio->io_error == 0) { if (using_rdata) { abd_copy(hdr->b_crypt_hdr.b_rabd, cb->l2rcb_abd, arc_hdr_size(hdr)); } else { abd_copy(hdr->b_l1hdr.b_pabd, cb->l2rcb_abd, arc_hdr_size(hdr)); } } /* * The following must be done regardless of whether * there was an error: * - free the temporary buffer * - point zio to the real ARC buffer * - set zio size accordingly * These are required because zio is either re-used for * an I/O of the block in the case of the error * or the zio is passed to arc_read_done() and it * needs real data. */ abd_free(cb->l2rcb_abd); zio->io_size = zio->io_orig_size = arc_hdr_size(hdr); if (using_rdata) { ASSERT(HDR_HAS_RABD(hdr)); zio->io_abd = zio->io_orig_abd = hdr->b_crypt_hdr.b_rabd; } else { ASSERT3P(hdr->b_l1hdr.b_pabd, !=, NULL); zio->io_abd = zio->io_orig_abd = hdr->b_l1hdr.b_pabd; } } ASSERT3P(zio->io_abd, !=, NULL); /* * Check this survived the L2ARC journey. */ ASSERT(zio->io_abd == hdr->b_l1hdr.b_pabd || (HDR_HAS_RABD(hdr) && zio->io_abd == hdr->b_crypt_hdr.b_rabd)); zio->io_bp_copy = cb->l2rcb_bp; /* XXX fix in L2ARC 2.0 */ zio->io_bp = &zio->io_bp_copy; /* XXX fix in L2ARC 2.0 */ zio->io_prop.zp_complevel = hdr->b_complevel; valid_cksum = arc_cksum_is_equal(hdr, zio); /* * b_rabd will always match the data as it exists on disk if it is * being used. Therefore if we are reading into b_rabd we do not * attempt to untransform the data. */ if (valid_cksum && !using_rdata) tfm_error = l2arc_untransform(zio, cb); if (valid_cksum && tfm_error == 0 && zio->io_error == 0 && !HDR_L2_EVICTED(hdr)) { mutex_exit(hash_lock); zio->io_private = hdr; arc_read_done(zio); } else { /* * Buffer didn't survive caching. Increment stats and * reissue to the original storage device. */ if (zio->io_error != 0) { ARCSTAT_BUMP(arcstat_l2_io_error); } else { zio->io_error = SET_ERROR(EIO); } if (!valid_cksum || tfm_error != 0) ARCSTAT_BUMP(arcstat_l2_cksum_bad); /* * If there's no waiter, issue an async i/o to the primary * storage now. If there *is* a waiter, the caller must * issue the i/o in a context where it's OK to block. */ if (zio->io_waiter == NULL) { zio_t *pio = zio_unique_parent(zio); void *abd = (using_rdata) ? hdr->b_crypt_hdr.b_rabd : hdr->b_l1hdr.b_pabd; ASSERT(!pio || pio->io_child_type == ZIO_CHILD_LOGICAL); zio = zio_read(pio, zio->io_spa, zio->io_bp, abd, zio->io_size, arc_read_done, hdr, zio->io_priority, cb->l2rcb_flags, &cb->l2rcb_zb); /* * Original ZIO will be freed, so we need to update * ARC header with the new ZIO pointer to be used * by zio_change_priority() in arc_read(). */ for (struct arc_callback *acb = hdr->b_l1hdr.b_acb; acb != NULL; acb = acb->acb_next) acb->acb_zio_head = zio; mutex_exit(hash_lock); zio_nowait(zio); } else { mutex_exit(hash_lock); } } kmem_free(cb, sizeof (l2arc_read_callback_t)); } /* * This is the list priority from which the L2ARC will search for pages to * cache. This is used within loops (0..3) to cycle through lists in the * desired order. This order can have a significant effect on cache * performance. * * Currently the metadata lists are hit first, MFU then MRU, followed by * the data lists. This function returns a locked list, and also returns * the lock pointer. */ static multilist_sublist_t * l2arc_sublist_lock(int list_num) { multilist_t *ml = NULL; unsigned int idx; ASSERT(list_num >= 0 && list_num < L2ARC_FEED_TYPES); switch (list_num) { case 0: ml = &arc_mfu->arcs_list[ARC_BUFC_METADATA]; break; case 1: ml = &arc_mru->arcs_list[ARC_BUFC_METADATA]; break; case 2: ml = &arc_mfu->arcs_list[ARC_BUFC_DATA]; break; case 3: ml = &arc_mru->arcs_list[ARC_BUFC_DATA]; break; default: return (NULL); } /* * Return a randomly-selected sublist. This is acceptable * because the caller feeds only a little bit of data for each * call (8MB). Subsequent calls will result in different * sublists being selected. */ idx = multilist_get_random_index(ml); return (multilist_sublist_lock(ml, idx)); } /* * Calculates the maximum overhead of L2ARC metadata log blocks for a given * L2ARC write size. l2arc_evict and l2arc_write_size need to include this * overhead in processing to make sure there is enough headroom available * when writing buffers. */ static inline uint64_t l2arc_log_blk_overhead(uint64_t write_sz, l2arc_dev_t *dev) { if (dev->l2ad_log_entries == 0) { return (0); } else { uint64_t log_entries = write_sz >> SPA_MINBLOCKSHIFT; uint64_t log_blocks = (log_entries + dev->l2ad_log_entries - 1) / dev->l2ad_log_entries; return (vdev_psize_to_asize(dev->l2ad_vdev, sizeof (l2arc_log_blk_phys_t)) * log_blocks); } } /* * Evict buffers from the device write hand to the distance specified in * bytes. This distance may span populated buffers, it may span nothing. * This is clearing a region on the L2ARC device ready for writing. * If the 'all' boolean is set, every buffer is evicted. */ static void l2arc_evict(l2arc_dev_t *dev, uint64_t distance, boolean_t all) { list_t *buflist; arc_buf_hdr_t *hdr, *hdr_prev; kmutex_t *hash_lock; uint64_t taddr; l2arc_lb_ptr_buf_t *lb_ptr_buf, *lb_ptr_buf_prev; vdev_t *vd = dev->l2ad_vdev; boolean_t rerun; buflist = &dev->l2ad_buflist; /* * We need to add in the worst case scenario of log block overhead. */ distance += l2arc_log_blk_overhead(distance, dev); if (vd->vdev_has_trim && l2arc_trim_ahead > 0) { /* * Trim ahead of the write size 64MB or (l2arc_trim_ahead/100) * times the write size, whichever is greater. */ distance += MAX(64 * 1024 * 1024, (distance * l2arc_trim_ahead) / 100); } top: rerun = B_FALSE; if (dev->l2ad_hand >= (dev->l2ad_end - distance)) { /* * When there is no space to accommodate upcoming writes, * evict to the end. Then bump the write and evict hands * to the start and iterate. This iteration does not * happen indefinitely as we make sure in * l2arc_write_size() that when the write hand is reset, * the write size does not exceed the end of the device. */ rerun = B_TRUE; taddr = dev->l2ad_end; } else { taddr = dev->l2ad_hand + distance; } DTRACE_PROBE4(l2arc__evict, l2arc_dev_t *, dev, list_t *, buflist, uint64_t, taddr, boolean_t, all); if (!all) { /* * This check has to be placed after deciding whether to * iterate (rerun). */ if (dev->l2ad_first) { /* * This is the first sweep through the device. There is * nothing to evict. We have already trimmmed the * whole device. */ goto out; } else { /* * Trim the space to be evicted. */ if (vd->vdev_has_trim && dev->l2ad_evict < taddr && l2arc_trim_ahead > 0) { /* * We have to drop the spa_config lock because * vdev_trim_range() will acquire it. * l2ad_evict already accounts for the label * size. To prevent vdev_trim_ranges() from * adding it again, we subtract it from * l2ad_evict. */ spa_config_exit(dev->l2ad_spa, SCL_L2ARC, dev); vdev_trim_simple(vd, dev->l2ad_evict - VDEV_LABEL_START_SIZE, taddr - dev->l2ad_evict); spa_config_enter(dev->l2ad_spa, SCL_L2ARC, dev, RW_READER); } /* * When rebuilding L2ARC we retrieve the evict hand * from the header of the device. Of note, l2arc_evict() * does not actually delete buffers from the cache * device, but trimming may do so depending on the * hardware implementation. Thus keeping track of the * evict hand is useful. */ dev->l2ad_evict = MAX(dev->l2ad_evict, taddr); } } retry: mutex_enter(&dev->l2ad_mtx); /* * We have to account for evicted log blocks. Run vdev_space_update() * on log blocks whose offset (in bytes) is before the evicted offset * (in bytes) by searching in the list of pointers to log blocks * present in the L2ARC device. */ for (lb_ptr_buf = list_tail(&dev->l2ad_lbptr_list); lb_ptr_buf; lb_ptr_buf = lb_ptr_buf_prev) { lb_ptr_buf_prev = list_prev(&dev->l2ad_lbptr_list, lb_ptr_buf); /* L2BLK_GET_PSIZE returns aligned size for log blocks */ uint64_t asize = L2BLK_GET_PSIZE( (lb_ptr_buf->lb_ptr)->lbp_prop); /* * We don't worry about log blocks left behind (ie * lbp_payload_start < l2ad_hand) because l2arc_write_buffers() * will never write more than l2arc_evict() evicts. */ if (!all && l2arc_log_blkptr_valid(dev, lb_ptr_buf->lb_ptr)) { break; } else { vdev_space_update(vd, -asize, 0, 0); ARCSTAT_INCR(arcstat_l2_log_blk_asize, -asize); ARCSTAT_BUMPDOWN(arcstat_l2_log_blk_count); zfs_refcount_remove_many(&dev->l2ad_lb_asize, asize, lb_ptr_buf); zfs_refcount_remove(&dev->l2ad_lb_count, lb_ptr_buf); list_remove(&dev->l2ad_lbptr_list, lb_ptr_buf); kmem_free(lb_ptr_buf->lb_ptr, sizeof (l2arc_log_blkptr_t)); kmem_free(lb_ptr_buf, sizeof (l2arc_lb_ptr_buf_t)); } } for (hdr = list_tail(buflist); hdr; hdr = hdr_prev) { hdr_prev = list_prev(buflist, hdr); ASSERT(!HDR_EMPTY(hdr)); hash_lock = HDR_LOCK(hdr); /* * We cannot use mutex_enter or else we can deadlock * with l2arc_write_buffers (due to swapping the order * the hash lock and l2ad_mtx are taken). */ if (!mutex_tryenter(hash_lock)) { /* * Missed the hash lock. Retry. */ ARCSTAT_BUMP(arcstat_l2_evict_lock_retry); mutex_exit(&dev->l2ad_mtx); mutex_enter(hash_lock); mutex_exit(hash_lock); goto retry; } /* * A header can't be on this list if it doesn't have L2 header. */ ASSERT(HDR_HAS_L2HDR(hdr)); /* Ensure this header has finished being written. */ ASSERT(!HDR_L2_WRITING(hdr)); ASSERT(!HDR_L2_WRITE_HEAD(hdr)); if (!all && (hdr->b_l2hdr.b_daddr >= dev->l2ad_evict || hdr->b_l2hdr.b_daddr < dev->l2ad_hand)) { /* * We've evicted to the target address, * or the end of the device. */ mutex_exit(hash_lock); break; } if (!HDR_HAS_L1HDR(hdr)) { ASSERT(!HDR_L2_READING(hdr)); /* * This doesn't exist in the ARC. Destroy. * arc_hdr_destroy() will call list_remove() * and decrement arcstat_l2_lsize. */ arc_change_state(arc_anon, hdr, hash_lock); arc_hdr_destroy(hdr); } else { ASSERT(hdr->b_l1hdr.b_state != arc_l2c_only); ARCSTAT_BUMP(arcstat_l2_evict_l1cached); /* * Invalidate issued or about to be issued * reads, since we may be about to write * over this location. */ if (HDR_L2_READING(hdr)) { ARCSTAT_BUMP(arcstat_l2_evict_reading); arc_hdr_set_flags(hdr, ARC_FLAG_L2_EVICTED); } arc_hdr_l2hdr_destroy(hdr); } mutex_exit(hash_lock); } mutex_exit(&dev->l2ad_mtx); out: /* * We need to check if we evict all buffers, otherwise we may iterate * unnecessarily. */ if (!all && rerun) { /* * Bump device hand to the device start if it is approaching the * end. l2arc_evict() has already evicted ahead for this case. */ dev->l2ad_hand = dev->l2ad_start; dev->l2ad_evict = dev->l2ad_start; dev->l2ad_first = B_FALSE; goto top; } if (!all) { /* * In case of cache device removal (all) the following * assertions may be violated without functional consequences * as the device is about to be removed. */ ASSERT3U(dev->l2ad_hand + distance, <, dev->l2ad_end); if (!dev->l2ad_first) ASSERT3U(dev->l2ad_hand, <, dev->l2ad_evict); } } /* * Handle any abd transforms that might be required for writing to the L2ARC. * If successful, this function will always return an abd with the data * transformed as it is on disk in a new abd of asize bytes. */ static int l2arc_apply_transforms(spa_t *spa, arc_buf_hdr_t *hdr, uint64_t asize, abd_t **abd_out) { int ret; void *tmp = NULL; abd_t *cabd = NULL, *eabd = NULL, *to_write = hdr->b_l1hdr.b_pabd; enum zio_compress compress = HDR_GET_COMPRESS(hdr); uint64_t psize = HDR_GET_PSIZE(hdr); uint64_t size = arc_hdr_size(hdr); boolean_t ismd = HDR_ISTYPE_METADATA(hdr); boolean_t bswap = (hdr->b_l1hdr.b_byteswap != DMU_BSWAP_NUMFUNCS); dsl_crypto_key_t *dck = NULL; uint8_t mac[ZIO_DATA_MAC_LEN] = { 0 }; boolean_t no_crypt = B_FALSE; ASSERT((HDR_GET_COMPRESS(hdr) != ZIO_COMPRESS_OFF && !HDR_COMPRESSION_ENABLED(hdr)) || HDR_ENCRYPTED(hdr) || HDR_SHARED_DATA(hdr) || psize != asize); ASSERT3U(psize, <=, asize); /* * If this data simply needs its own buffer, we simply allocate it * and copy the data. This may be done to eliminate a dependency on a * shared buffer or to reallocate the buffer to match asize. */ if (HDR_HAS_RABD(hdr) && asize != psize) { ASSERT3U(asize, >=, psize); to_write = abd_alloc_for_io(asize, ismd); abd_copy(to_write, hdr->b_crypt_hdr.b_rabd, psize); if (psize != asize) abd_zero_off(to_write, psize, asize - psize); goto out; } if ((compress == ZIO_COMPRESS_OFF || HDR_COMPRESSION_ENABLED(hdr)) && !HDR_ENCRYPTED(hdr)) { ASSERT3U(size, ==, psize); to_write = abd_alloc_for_io(asize, ismd); abd_copy(to_write, hdr->b_l1hdr.b_pabd, size); if (size != asize) abd_zero_off(to_write, size, asize - size); goto out; } if (compress != ZIO_COMPRESS_OFF && !HDR_COMPRESSION_ENABLED(hdr)) { /* * In some cases, we can wind up with size > asize, so * we need to opt for the larger allocation option here. * * (We also need abd_return_buf_copy in all cases because * it's an ASSERT() to modify the buffer before returning it * with arc_return_buf(), and all the compressors * write things before deciding to fail compression in nearly * every case.) */ cabd = abd_alloc_for_io(size, ismd); tmp = abd_borrow_buf(cabd, size); psize = zio_compress_data(compress, to_write, tmp, size, hdr->b_complevel); if (psize >= asize) { psize = HDR_GET_PSIZE(hdr); abd_return_buf_copy(cabd, tmp, size); HDR_SET_COMPRESS(hdr, ZIO_COMPRESS_OFF); to_write = cabd; abd_copy(to_write, hdr->b_l1hdr.b_pabd, psize); if (psize != asize) abd_zero_off(to_write, psize, asize - psize); goto encrypt; } ASSERT3U(psize, <=, HDR_GET_PSIZE(hdr)); if (psize < asize) memset((char *)tmp + psize, 0, asize - psize); psize = HDR_GET_PSIZE(hdr); abd_return_buf_copy(cabd, tmp, size); to_write = cabd; } encrypt: if (HDR_ENCRYPTED(hdr)) { eabd = abd_alloc_for_io(asize, ismd); /* * If the dataset was disowned before the buffer * made it to this point, the key to re-encrypt * it won't be available. In this case we simply * won't write the buffer to the L2ARC. */ ret = spa_keystore_lookup_key(spa, hdr->b_crypt_hdr.b_dsobj, FTAG, &dck); if (ret != 0) goto error; ret = zio_do_crypt_abd(B_TRUE, &dck->dck_key, hdr->b_crypt_hdr.b_ot, bswap, hdr->b_crypt_hdr.b_salt, hdr->b_crypt_hdr.b_iv, mac, psize, to_write, eabd, &no_crypt); if (ret != 0) goto error; if (no_crypt) abd_copy(eabd, to_write, psize); if (psize != asize) abd_zero_off(eabd, psize, asize - psize); /* assert that the MAC we got here matches the one we saved */ ASSERT0(memcmp(mac, hdr->b_crypt_hdr.b_mac, ZIO_DATA_MAC_LEN)); spa_keystore_dsl_key_rele(spa, dck, FTAG); if (to_write == cabd) abd_free(cabd); to_write = eabd; } out: ASSERT3P(to_write, !=, hdr->b_l1hdr.b_pabd); *abd_out = to_write; return (0); error: if (dck != NULL) spa_keystore_dsl_key_rele(spa, dck, FTAG); if (cabd != NULL) abd_free(cabd); if (eabd != NULL) abd_free(eabd); *abd_out = NULL; return (ret); } static void l2arc_blk_fetch_done(zio_t *zio) { l2arc_read_callback_t *cb; cb = zio->io_private; if (cb->l2rcb_abd != NULL) abd_free(cb->l2rcb_abd); kmem_free(cb, sizeof (l2arc_read_callback_t)); } /* * Find and write ARC buffers to the L2ARC device. * * An ARC_FLAG_L2_WRITING flag is set so that the L2ARC buffers are not valid * for reading until they have completed writing. * The headroom_boost is an in-out parameter used to maintain headroom boost * state between calls to this function. * * Returns the number of bytes actually written (which may be smaller than * the delta by which the device hand has changed due to alignment and the * writing of log blocks). */ static uint64_t l2arc_write_buffers(spa_t *spa, l2arc_dev_t *dev, uint64_t target_sz) { arc_buf_hdr_t *hdr, *hdr_prev, *head; uint64_t write_asize, write_psize, write_lsize, headroom; boolean_t full; l2arc_write_callback_t *cb = NULL; zio_t *pio, *wzio; uint64_t guid = spa_load_guid(spa); l2arc_dev_hdr_phys_t *l2dhdr = dev->l2ad_dev_hdr; ASSERT3P(dev->l2ad_vdev, !=, NULL); pio = NULL; write_lsize = write_asize = write_psize = 0; full = B_FALSE; head = kmem_cache_alloc(hdr_l2only_cache, KM_PUSHPAGE); arc_hdr_set_flags(head, ARC_FLAG_L2_WRITE_HEAD | ARC_FLAG_HAS_L2HDR); /* * Copy buffers for L2ARC writing. */ for (int pass = 0; pass < L2ARC_FEED_TYPES; pass++) { /* * If pass == 1 or 3, we cache MRU metadata and data * respectively. */ if (l2arc_mfuonly) { if (pass == 1 || pass == 3) continue; } multilist_sublist_t *mls = l2arc_sublist_lock(pass); uint64_t passed_sz = 0; VERIFY3P(mls, !=, NULL); /* * L2ARC fast warmup. * * Until the ARC is warm and starts to evict, read from the * head of the ARC lists rather than the tail. */ if (arc_warm == B_FALSE) hdr = multilist_sublist_head(mls); else hdr = multilist_sublist_tail(mls); headroom = target_sz * l2arc_headroom; if (zfs_compressed_arc_enabled) headroom = (headroom * l2arc_headroom_boost) / 100; for (; hdr; hdr = hdr_prev) { kmutex_t *hash_lock; abd_t *to_write = NULL; if (arc_warm == B_FALSE) hdr_prev = multilist_sublist_next(mls, hdr); else hdr_prev = multilist_sublist_prev(mls, hdr); hash_lock = HDR_LOCK(hdr); if (!mutex_tryenter(hash_lock)) { /* * Skip this buffer rather than waiting. */ continue; } passed_sz += HDR_GET_LSIZE(hdr); if (l2arc_headroom != 0 && passed_sz > headroom) { /* * Searched too far. */ mutex_exit(hash_lock); break; } if (!l2arc_write_eligible(guid, hdr)) { mutex_exit(hash_lock); continue; } ASSERT(HDR_HAS_L1HDR(hdr)); ASSERT3U(HDR_GET_PSIZE(hdr), >, 0); ASSERT3U(arc_hdr_size(hdr), >, 0); ASSERT(hdr->b_l1hdr.b_pabd != NULL || HDR_HAS_RABD(hdr)); uint64_t psize = HDR_GET_PSIZE(hdr); uint64_t asize = vdev_psize_to_asize(dev->l2ad_vdev, psize); if ((write_asize + asize) > target_sz) { full = B_TRUE; mutex_exit(hash_lock); break; } /* * We rely on the L1 portion of the header below, so * it's invalid for this header to have been evicted out * of the ghost cache, prior to being written out. The * ARC_FLAG_L2_WRITING bit ensures this won't happen. */ arc_hdr_set_flags(hdr, ARC_FLAG_L2_WRITING); /* * If this header has b_rabd, we can use this since it * must always match the data exactly as it exists on * disk. Otherwise, the L2ARC can normally use the * hdr's data, but if we're sharing data between the * hdr and one of its bufs, L2ARC needs its own copy of * the data so that the ZIO below can't race with the * buf consumer. To ensure that this copy will be * available for the lifetime of the ZIO and be cleaned * up afterwards, we add it to the l2arc_free_on_write * queue. If we need to apply any transforms to the * data (compression, encryption) we will also need the * extra buffer. */ if (HDR_HAS_RABD(hdr) && psize == asize) { to_write = hdr->b_crypt_hdr.b_rabd; } else if ((HDR_COMPRESSION_ENABLED(hdr) || HDR_GET_COMPRESS(hdr) == ZIO_COMPRESS_OFF) && !HDR_ENCRYPTED(hdr) && !HDR_SHARED_DATA(hdr) && psize == asize) { to_write = hdr->b_l1hdr.b_pabd; } else { int ret; arc_buf_contents_t type = arc_buf_type(hdr); ret = l2arc_apply_transforms(spa, hdr, asize, &to_write); if (ret != 0) { arc_hdr_clear_flags(hdr, ARC_FLAG_L2_WRITING); mutex_exit(hash_lock); continue; } l2arc_free_abd_on_write(to_write, asize, type); } if (pio == NULL) { /* * Insert a dummy header on the buflist so * l2arc_write_done() can find where the * write buffers begin without searching. */ mutex_enter(&dev->l2ad_mtx); list_insert_head(&dev->l2ad_buflist, head); mutex_exit(&dev->l2ad_mtx); cb = kmem_alloc( sizeof (l2arc_write_callback_t), KM_SLEEP); cb->l2wcb_dev = dev; cb->l2wcb_head = head; /* * Create a list to save allocated abd buffers * for l2arc_log_blk_commit(). */ list_create(&cb->l2wcb_abd_list, sizeof (l2arc_lb_abd_buf_t), offsetof(l2arc_lb_abd_buf_t, node)); pio = zio_root(spa, l2arc_write_done, cb, ZIO_FLAG_CANFAIL); } hdr->b_l2hdr.b_dev = dev; hdr->b_l2hdr.b_hits = 0; hdr->b_l2hdr.b_daddr = dev->l2ad_hand; hdr->b_l2hdr.b_arcs_state = hdr->b_l1hdr.b_state->arcs_state; arc_hdr_set_flags(hdr, ARC_FLAG_HAS_L2HDR); mutex_enter(&dev->l2ad_mtx); list_insert_head(&dev->l2ad_buflist, hdr); mutex_exit(&dev->l2ad_mtx); (void) zfs_refcount_add_many(&dev->l2ad_alloc, arc_hdr_size(hdr), hdr); wzio = zio_write_phys(pio, dev->l2ad_vdev, hdr->b_l2hdr.b_daddr, asize, to_write, ZIO_CHECKSUM_OFF, NULL, hdr, ZIO_PRIORITY_ASYNC_WRITE, ZIO_FLAG_CANFAIL, B_FALSE); write_lsize += HDR_GET_LSIZE(hdr); DTRACE_PROBE2(l2arc__write, vdev_t *, dev->l2ad_vdev, zio_t *, wzio); write_psize += psize; write_asize += asize; dev->l2ad_hand += asize; l2arc_hdr_arcstats_increment(hdr); vdev_space_update(dev->l2ad_vdev, asize, 0, 0); mutex_exit(hash_lock); /* * Append buf info to current log and commit if full. * arcstat_l2_{size,asize} kstats are updated * internally. */ if (l2arc_log_blk_insert(dev, hdr)) l2arc_log_blk_commit(dev, pio, cb); zio_nowait(wzio); } multilist_sublist_unlock(mls); if (full == B_TRUE) break; } /* No buffers selected for writing? */ if (pio == NULL) { ASSERT0(write_lsize); ASSERT(!HDR_HAS_L1HDR(head)); kmem_cache_free(hdr_l2only_cache, head); /* * Although we did not write any buffers l2ad_evict may * have advanced. */ if (dev->l2ad_evict != l2dhdr->dh_evict) l2arc_dev_hdr_update(dev); return (0); } if (!dev->l2ad_first) ASSERT3U(dev->l2ad_hand, <=, dev->l2ad_evict); ASSERT3U(write_asize, <=, target_sz); ARCSTAT_BUMP(arcstat_l2_writes_sent); ARCSTAT_INCR(arcstat_l2_write_bytes, write_psize); dev->l2ad_writing = B_TRUE; (void) zio_wait(pio); dev->l2ad_writing = B_FALSE; /* * Update the device header after the zio completes as * l2arc_write_done() may have updated the memory holding the log block * pointers in the device header. */ l2arc_dev_hdr_update(dev); return (write_asize); } static boolean_t l2arc_hdr_limit_reached(void) { int64_t s = aggsum_upper_bound(&arc_sums.arcstat_l2_hdr_size); return (arc_reclaim_needed() || (s > arc_meta_limit * 3 / 4) || (s > (arc_warm ? arc_c : arc_c_max) * l2arc_meta_percent / 100)); } /* * This thread feeds the L2ARC at regular intervals. This is the beating * heart of the L2ARC. */ static __attribute__((noreturn)) void l2arc_feed_thread(void *unused) { (void) unused; callb_cpr_t cpr; l2arc_dev_t *dev; spa_t *spa; uint64_t size, wrote; clock_t begin, next = ddi_get_lbolt(); fstrans_cookie_t cookie; CALLB_CPR_INIT(&cpr, &l2arc_feed_thr_lock, callb_generic_cpr, FTAG); mutex_enter(&l2arc_feed_thr_lock); cookie = spl_fstrans_mark(); while (l2arc_thread_exit == 0) { CALLB_CPR_SAFE_BEGIN(&cpr); (void) cv_timedwait_idle(&l2arc_feed_thr_cv, &l2arc_feed_thr_lock, next); CALLB_CPR_SAFE_END(&cpr, &l2arc_feed_thr_lock); next = ddi_get_lbolt() + hz; /* * Quick check for L2ARC devices. */ mutex_enter(&l2arc_dev_mtx); if (l2arc_ndev == 0) { mutex_exit(&l2arc_dev_mtx); continue; } mutex_exit(&l2arc_dev_mtx); begin = ddi_get_lbolt(); /* * This selects the next l2arc device to write to, and in * doing so the next spa to feed from: dev->l2ad_spa. This * will return NULL if there are now no l2arc devices or if * they are all faulted. * * If a device is returned, its spa's config lock is also * held to prevent device removal. l2arc_dev_get_next() * will grab and release l2arc_dev_mtx. */ if ((dev = l2arc_dev_get_next()) == NULL) continue; spa = dev->l2ad_spa; ASSERT3P(spa, !=, NULL); /* * If the pool is read-only then force the feed thread to * sleep a little longer. */ if (!spa_writeable(spa)) { next = ddi_get_lbolt() + 5 * l2arc_feed_secs * hz; spa_config_exit(spa, SCL_L2ARC, dev); continue; } /* * Avoid contributing to memory pressure. */ if (l2arc_hdr_limit_reached()) { ARCSTAT_BUMP(arcstat_l2_abort_lowmem); spa_config_exit(spa, SCL_L2ARC, dev); continue; } ARCSTAT_BUMP(arcstat_l2_feeds); size = l2arc_write_size(dev); /* * Evict L2ARC buffers that will be overwritten. */ l2arc_evict(dev, size, B_FALSE); /* * Write ARC buffers. */ wrote = l2arc_write_buffers(spa, dev, size); /* * Calculate interval between writes. */ next = l2arc_write_interval(begin, size, wrote); spa_config_exit(spa, SCL_L2ARC, dev); } spl_fstrans_unmark(cookie); l2arc_thread_exit = 0; cv_broadcast(&l2arc_feed_thr_cv); CALLB_CPR_EXIT(&cpr); /* drops l2arc_feed_thr_lock */ thread_exit(); } boolean_t l2arc_vdev_present(vdev_t *vd) { return (l2arc_vdev_get(vd) != NULL); } /* * Returns the l2arc_dev_t associated with a particular vdev_t or NULL if * the vdev_t isn't an L2ARC device. */ l2arc_dev_t * l2arc_vdev_get(vdev_t *vd) { l2arc_dev_t *dev; mutex_enter(&l2arc_dev_mtx); for (dev = list_head(l2arc_dev_list); dev != NULL; dev = list_next(l2arc_dev_list, dev)) { if (dev->l2ad_vdev == vd) break; } mutex_exit(&l2arc_dev_mtx); return (dev); } static void l2arc_rebuild_dev(l2arc_dev_t *dev, boolean_t reopen) { l2arc_dev_hdr_phys_t *l2dhdr = dev->l2ad_dev_hdr; uint64_t l2dhdr_asize = dev->l2ad_dev_hdr_asize; spa_t *spa = dev->l2ad_spa; /* * The L2ARC has to hold at least the payload of one log block for * them to be restored (persistent L2ARC). The payload of a log block * depends on the amount of its log entries. We always write log blocks * with 1022 entries. How many of them are committed or restored depends * on the size of the L2ARC device. Thus the maximum payload of * one log block is 1022 * SPA_MAXBLOCKSIZE = 16GB. If the L2ARC device * is less than that, we reduce the amount of committed and restored * log entries per block so as to enable persistence. */ if (dev->l2ad_end < l2arc_rebuild_blocks_min_l2size) { dev->l2ad_log_entries = 0; } else { dev->l2ad_log_entries = MIN((dev->l2ad_end - dev->l2ad_start) >> SPA_MAXBLOCKSHIFT, L2ARC_LOG_BLK_MAX_ENTRIES); } /* * Read the device header, if an error is returned do not rebuild L2ARC. */ if (l2arc_dev_hdr_read(dev) == 0 && dev->l2ad_log_entries > 0) { /* * If we are onlining a cache device (vdev_reopen) that was * still present (l2arc_vdev_present()) and rebuild is enabled, * we should evict all ARC buffers and pointers to log blocks * and reclaim their space before restoring its contents to * L2ARC. */ if (reopen) { if (!l2arc_rebuild_enabled) { return; } else { l2arc_evict(dev, 0, B_TRUE); /* start a new log block */ dev->l2ad_log_ent_idx = 0; dev->l2ad_log_blk_payload_asize = 0; dev->l2ad_log_blk_payload_start = 0; } } /* * Just mark the device as pending for a rebuild. We won't * be starting a rebuild in line here as it would block pool * import. Instead spa_load_impl will hand that off to an * async task which will call l2arc_spa_rebuild_start. */ dev->l2ad_rebuild = B_TRUE; } else if (spa_writeable(spa)) { /* * In this case TRIM the whole device if l2arc_trim_ahead > 0, * otherwise create a new header. We zero out the memory holding * the header to reset dh_start_lbps. If we TRIM the whole * device the new header will be written by * vdev_trim_l2arc_thread() at the end of the TRIM to update the * trim_state in the header too. When reading the header, if * trim_state is not VDEV_TRIM_COMPLETE and l2arc_trim_ahead > 0 * we opt to TRIM the whole device again. */ if (l2arc_trim_ahead > 0) { dev->l2ad_trim_all = B_TRUE; } else { memset(l2dhdr, 0, l2dhdr_asize); l2arc_dev_hdr_update(dev); } } } /* * Add a vdev for use by the L2ARC. By this point the spa has already * validated the vdev and opened it. */ void l2arc_add_vdev(spa_t *spa, vdev_t *vd) { l2arc_dev_t *adddev; uint64_t l2dhdr_asize; ASSERT(!l2arc_vdev_present(vd)); /* * Create a new l2arc device entry. */ adddev = vmem_zalloc(sizeof (l2arc_dev_t), KM_SLEEP); adddev->l2ad_spa = spa; adddev->l2ad_vdev = vd; /* leave extra size for an l2arc device header */ l2dhdr_asize = adddev->l2ad_dev_hdr_asize = MAX(sizeof (*adddev->l2ad_dev_hdr), 1 << vd->vdev_ashift); adddev->l2ad_start = VDEV_LABEL_START_SIZE + l2dhdr_asize; adddev->l2ad_end = VDEV_LABEL_START_SIZE + vdev_get_min_asize(vd); ASSERT3U(adddev->l2ad_start, <, adddev->l2ad_end); adddev->l2ad_hand = adddev->l2ad_start; adddev->l2ad_evict = adddev->l2ad_start; adddev->l2ad_first = B_TRUE; adddev->l2ad_writing = B_FALSE; adddev->l2ad_trim_all = B_FALSE; list_link_init(&adddev->l2ad_node); adddev->l2ad_dev_hdr = kmem_zalloc(l2dhdr_asize, KM_SLEEP); mutex_init(&adddev->l2ad_mtx, NULL, MUTEX_DEFAULT, NULL); /* * This is a list of all ARC buffers that are still valid on the * device. */ list_create(&adddev->l2ad_buflist, sizeof (arc_buf_hdr_t), offsetof(arc_buf_hdr_t, b_l2hdr.b_l2node)); /* * This is a list of pointers to log blocks that are still present * on the device. */ list_create(&adddev->l2ad_lbptr_list, sizeof (l2arc_lb_ptr_buf_t), offsetof(l2arc_lb_ptr_buf_t, node)); vdev_space_update(vd, 0, 0, adddev->l2ad_end - adddev->l2ad_hand); zfs_refcount_create(&adddev->l2ad_alloc); zfs_refcount_create(&adddev->l2ad_lb_asize); zfs_refcount_create(&adddev->l2ad_lb_count); /* * Decide if dev is eligible for L2ARC rebuild or whole device * trimming. This has to happen before the device is added in the * cache device list and l2arc_dev_mtx is released. Otherwise * l2arc_feed_thread() might already start writing on the * device. */ l2arc_rebuild_dev(adddev, B_FALSE); /* * Add device to global list */ mutex_enter(&l2arc_dev_mtx); list_insert_head(l2arc_dev_list, adddev); atomic_inc_64(&l2arc_ndev); mutex_exit(&l2arc_dev_mtx); } /* * Decide if a vdev is eligible for L2ARC rebuild, called from vdev_reopen() * in case of onlining a cache device. */ void l2arc_rebuild_vdev(vdev_t *vd, boolean_t reopen) { l2arc_dev_t *dev = NULL; dev = l2arc_vdev_get(vd); ASSERT3P(dev, !=, NULL); /* * In contrast to l2arc_add_vdev() we do not have to worry about * l2arc_feed_thread() invalidating previous content when onlining a * cache device. The device parameters (l2ad*) are not cleared when * offlining the device and writing new buffers will not invalidate * all previous content. In worst case only buffers that have not had * their log block written to the device will be lost. * When onlining the cache device (ie offline->online without exporting * the pool in between) this happens: * vdev_reopen() -> vdev_open() -> l2arc_rebuild_vdev() * | | * vdev_is_dead() = B_FALSE l2ad_rebuild = B_TRUE * During the time where vdev_is_dead = B_FALSE and until l2ad_rebuild * is set to B_TRUE we might write additional buffers to the device. */ l2arc_rebuild_dev(dev, reopen); } /* * Remove a vdev from the L2ARC. */ void l2arc_remove_vdev(vdev_t *vd) { l2arc_dev_t *remdev = NULL; /* * Find the device by vdev */ remdev = l2arc_vdev_get(vd); ASSERT3P(remdev, !=, NULL); /* * Cancel any ongoing or scheduled rebuild. */ mutex_enter(&l2arc_rebuild_thr_lock); if (remdev->l2ad_rebuild_began == B_TRUE) { remdev->l2ad_rebuild_cancel = B_TRUE; while (remdev->l2ad_rebuild == B_TRUE) cv_wait(&l2arc_rebuild_thr_cv, &l2arc_rebuild_thr_lock); } mutex_exit(&l2arc_rebuild_thr_lock); /* * Remove device from global list */ mutex_enter(&l2arc_dev_mtx); list_remove(l2arc_dev_list, remdev); l2arc_dev_last = NULL; /* may have been invalidated */ atomic_dec_64(&l2arc_ndev); mutex_exit(&l2arc_dev_mtx); /* * Clear all buflists and ARC references. L2ARC device flush. */ l2arc_evict(remdev, 0, B_TRUE); list_destroy(&remdev->l2ad_buflist); ASSERT(list_is_empty(&remdev->l2ad_lbptr_list)); list_destroy(&remdev->l2ad_lbptr_list); mutex_destroy(&remdev->l2ad_mtx); zfs_refcount_destroy(&remdev->l2ad_alloc); zfs_refcount_destroy(&remdev->l2ad_lb_asize); zfs_refcount_destroy(&remdev->l2ad_lb_count); kmem_free(remdev->l2ad_dev_hdr, remdev->l2ad_dev_hdr_asize); vmem_free(remdev, sizeof (l2arc_dev_t)); } void l2arc_init(void) { l2arc_thread_exit = 0; l2arc_ndev = 0; mutex_init(&l2arc_feed_thr_lock, NULL, MUTEX_DEFAULT, NULL); cv_init(&l2arc_feed_thr_cv, NULL, CV_DEFAULT, NULL); mutex_init(&l2arc_rebuild_thr_lock, NULL, MUTEX_DEFAULT, NULL); cv_init(&l2arc_rebuild_thr_cv, NULL, CV_DEFAULT, NULL); mutex_init(&l2arc_dev_mtx, NULL, MUTEX_DEFAULT, NULL); mutex_init(&l2arc_free_on_write_mtx, NULL, MUTEX_DEFAULT, NULL); l2arc_dev_list = &L2ARC_dev_list; l2arc_free_on_write = &L2ARC_free_on_write; list_create(l2arc_dev_list, sizeof (l2arc_dev_t), offsetof(l2arc_dev_t, l2ad_node)); list_create(l2arc_free_on_write, sizeof (l2arc_data_free_t), offsetof(l2arc_data_free_t, l2df_list_node)); } void l2arc_fini(void) { mutex_destroy(&l2arc_feed_thr_lock); cv_destroy(&l2arc_feed_thr_cv); mutex_destroy(&l2arc_rebuild_thr_lock); cv_destroy(&l2arc_rebuild_thr_cv); mutex_destroy(&l2arc_dev_mtx); mutex_destroy(&l2arc_free_on_write_mtx); list_destroy(l2arc_dev_list); list_destroy(l2arc_free_on_write); } void l2arc_start(void) { if (!(spa_mode_global & SPA_MODE_WRITE)) return; (void) thread_create(NULL, 0, l2arc_feed_thread, NULL, 0, &p0, TS_RUN, defclsyspri); } void l2arc_stop(void) { if (!(spa_mode_global & SPA_MODE_WRITE)) return; mutex_enter(&l2arc_feed_thr_lock); cv_signal(&l2arc_feed_thr_cv); /* kick thread out of startup */ l2arc_thread_exit = 1; while (l2arc_thread_exit != 0) cv_wait(&l2arc_feed_thr_cv, &l2arc_feed_thr_lock); mutex_exit(&l2arc_feed_thr_lock); } /* * Punches out rebuild threads for the L2ARC devices in a spa. This should * be called after pool import from the spa async thread, since starting * these threads directly from spa_import() will make them part of the * "zpool import" context and delay process exit (and thus pool import). */ void l2arc_spa_rebuild_start(spa_t *spa) { ASSERT(MUTEX_HELD(&spa_namespace_lock)); /* * Locate the spa's l2arc devices and kick off rebuild threads. */ for (int i = 0; i < spa->spa_l2cache.sav_count; i++) { l2arc_dev_t *dev = l2arc_vdev_get(spa->spa_l2cache.sav_vdevs[i]); if (dev == NULL) { /* Don't attempt a rebuild if the vdev is UNAVAIL */ continue; } mutex_enter(&l2arc_rebuild_thr_lock); if (dev->l2ad_rebuild && !dev->l2ad_rebuild_cancel) { dev->l2ad_rebuild_began = B_TRUE; (void) thread_create(NULL, 0, l2arc_dev_rebuild_thread, dev, 0, &p0, TS_RUN, minclsyspri); } mutex_exit(&l2arc_rebuild_thr_lock); } } /* * Main entry point for L2ARC rebuilding. */ static __attribute__((noreturn)) void l2arc_dev_rebuild_thread(void *arg) { l2arc_dev_t *dev = arg; VERIFY(!dev->l2ad_rebuild_cancel); VERIFY(dev->l2ad_rebuild); (void) l2arc_rebuild(dev); mutex_enter(&l2arc_rebuild_thr_lock); dev->l2ad_rebuild_began = B_FALSE; dev->l2ad_rebuild = B_FALSE; mutex_exit(&l2arc_rebuild_thr_lock); thread_exit(); } /* * This function implements the actual L2ARC metadata rebuild. It: * starts reading the log block chain and restores each block's contents * to memory (reconstructing arc_buf_hdr_t's). * * Operation stops under any of the following conditions: * * 1) We reach the end of the log block chain. * 2) We encounter *any* error condition (cksum errors, io errors) */ static int l2arc_rebuild(l2arc_dev_t *dev) { vdev_t *vd = dev->l2ad_vdev; spa_t *spa = vd->vdev_spa; int err = 0; l2arc_dev_hdr_phys_t *l2dhdr = dev->l2ad_dev_hdr; l2arc_log_blk_phys_t *this_lb, *next_lb; zio_t *this_io = NULL, *next_io = NULL; l2arc_log_blkptr_t lbps[2]; l2arc_lb_ptr_buf_t *lb_ptr_buf; boolean_t lock_held; this_lb = vmem_zalloc(sizeof (*this_lb), KM_SLEEP); next_lb = vmem_zalloc(sizeof (*next_lb), KM_SLEEP); /* * We prevent device removal while issuing reads to the device, * then during the rebuilding phases we drop this lock again so * that a spa_unload or device remove can be initiated - this is * safe, because the spa will signal us to stop before removing * our device and wait for us to stop. */ spa_config_enter(spa, SCL_L2ARC, vd, RW_READER); lock_held = B_TRUE; /* * Retrieve the persistent L2ARC device state. * L2BLK_GET_PSIZE returns aligned size for log blocks. */ dev->l2ad_evict = MAX(l2dhdr->dh_evict, dev->l2ad_start); dev->l2ad_hand = MAX(l2dhdr->dh_start_lbps[0].lbp_daddr + L2BLK_GET_PSIZE((&l2dhdr->dh_start_lbps[0])->lbp_prop), dev->l2ad_start); dev->l2ad_first = !!(l2dhdr->dh_flags & L2ARC_DEV_HDR_EVICT_FIRST); vd->vdev_trim_action_time = l2dhdr->dh_trim_action_time; vd->vdev_trim_state = l2dhdr->dh_trim_state; /* * In case the zfs module parameter l2arc_rebuild_enabled is false * we do not start the rebuild process. */ if (!l2arc_rebuild_enabled) goto out; /* Prepare the rebuild process */ memcpy(lbps, l2dhdr->dh_start_lbps, sizeof (lbps)); /* Start the rebuild process */ for (;;) { if (!l2arc_log_blkptr_valid(dev, &lbps[0])) break; if ((err = l2arc_log_blk_read(dev, &lbps[0], &lbps[1], this_lb, next_lb, this_io, &next_io)) != 0) goto out; /* * Our memory pressure valve. If the system is running low * on memory, rather than swamping memory with new ARC buf * hdrs, we opt not to rebuild the L2ARC. At this point, * however, we have already set up our L2ARC dev to chain in * new metadata log blocks, so the user may choose to offline/ * online the L2ARC dev at a later time (or re-import the pool) * to reconstruct it (when there's less memory pressure). */ if (l2arc_hdr_limit_reached()) { ARCSTAT_BUMP(arcstat_l2_rebuild_abort_lowmem); cmn_err(CE_NOTE, "System running low on memory, " "aborting L2ARC rebuild."); err = SET_ERROR(ENOMEM); goto out; } spa_config_exit(spa, SCL_L2ARC, vd); lock_held = B_FALSE; /* * Now that we know that the next_lb checks out alright, we * can start reconstruction from this log block. * L2BLK_GET_PSIZE returns aligned size for log blocks. */ uint64_t asize = L2BLK_GET_PSIZE((&lbps[0])->lbp_prop); l2arc_log_blk_restore(dev, this_lb, asize); /* * log block restored, include its pointer in the list of * pointers to log blocks present in the L2ARC device. */ lb_ptr_buf = kmem_zalloc(sizeof (l2arc_lb_ptr_buf_t), KM_SLEEP); lb_ptr_buf->lb_ptr = kmem_zalloc(sizeof (l2arc_log_blkptr_t), KM_SLEEP); memcpy(lb_ptr_buf->lb_ptr, &lbps[0], sizeof (l2arc_log_blkptr_t)); mutex_enter(&dev->l2ad_mtx); list_insert_tail(&dev->l2ad_lbptr_list, lb_ptr_buf); ARCSTAT_INCR(arcstat_l2_log_blk_asize, asize); ARCSTAT_BUMP(arcstat_l2_log_blk_count); zfs_refcount_add_many(&dev->l2ad_lb_asize, asize, lb_ptr_buf); zfs_refcount_add(&dev->l2ad_lb_count, lb_ptr_buf); mutex_exit(&dev->l2ad_mtx); vdev_space_update(vd, asize, 0, 0); /* * Protection against loops of log blocks: * * l2ad_hand l2ad_evict * V V * l2ad_start |=======================================| l2ad_end * -----|||----|||---|||----||| * (3) (2) (1) (0) * ---|||---|||----|||---||| * (7) (6) (5) (4) * * In this situation the pointer of log block (4) passes * l2arc_log_blkptr_valid() but the log block should not be * restored as it is overwritten by the payload of log block * (0). Only log blocks (0)-(3) should be restored. We check * whether l2ad_evict lies in between the payload starting * offset of the next log block (lbps[1].lbp_payload_start) * and the payload starting offset of the present log block * (lbps[0].lbp_payload_start). If true and this isn't the * first pass, we are looping from the beginning and we should * stop. */ if (l2arc_range_check_overlap(lbps[1].lbp_payload_start, lbps[0].lbp_payload_start, dev->l2ad_evict) && !dev->l2ad_first) goto out; kpreempt(KPREEMPT_SYNC); for (;;) { mutex_enter(&l2arc_rebuild_thr_lock); if (dev->l2ad_rebuild_cancel) { dev->l2ad_rebuild = B_FALSE; cv_signal(&l2arc_rebuild_thr_cv); mutex_exit(&l2arc_rebuild_thr_lock); err = SET_ERROR(ECANCELED); goto out; } mutex_exit(&l2arc_rebuild_thr_lock); if (spa_config_tryenter(spa, SCL_L2ARC, vd, RW_READER)) { lock_held = B_TRUE; break; } /* * L2ARC config lock held by somebody in writer, * possibly due to them trying to remove us. They'll * likely to want us to shut down, so after a little * delay, we check l2ad_rebuild_cancel and retry * the lock again. */ delay(1); } /* * Continue with the next log block. */ lbps[0] = lbps[1]; lbps[1] = this_lb->lb_prev_lbp; PTR_SWAP(this_lb, next_lb); this_io = next_io; next_io = NULL; } if (this_io != NULL) l2arc_log_blk_fetch_abort(this_io); out: if (next_io != NULL) l2arc_log_blk_fetch_abort(next_io); vmem_free(this_lb, sizeof (*this_lb)); vmem_free(next_lb, sizeof (*next_lb)); if (!l2arc_rebuild_enabled) { spa_history_log_internal(spa, "L2ARC rebuild", NULL, "disabled"); } else if (err == 0 && zfs_refcount_count(&dev->l2ad_lb_count) > 0) { ARCSTAT_BUMP(arcstat_l2_rebuild_success); spa_history_log_internal(spa, "L2ARC rebuild", NULL, "successful, restored %llu blocks", (u_longlong_t)zfs_refcount_count(&dev->l2ad_lb_count)); } else if (err == 0 && zfs_refcount_count(&dev->l2ad_lb_count) == 0) { /* * No error but also nothing restored, meaning the lbps array * in the device header points to invalid/non-present log * blocks. Reset the header. */ spa_history_log_internal(spa, "L2ARC rebuild", NULL, "no valid log blocks"); memset(l2dhdr, 0, dev->l2ad_dev_hdr_asize); l2arc_dev_hdr_update(dev); } else if (err == ECANCELED) { /* * In case the rebuild was canceled do not log to spa history * log as the pool may be in the process of being removed. */ zfs_dbgmsg("L2ARC rebuild aborted, restored %llu blocks", (u_longlong_t)zfs_refcount_count(&dev->l2ad_lb_count)); } else if (err != 0) { spa_history_log_internal(spa, "L2ARC rebuild", NULL, "aborted, restored %llu blocks", (u_longlong_t)zfs_refcount_count(&dev->l2ad_lb_count)); } if (lock_held) spa_config_exit(spa, SCL_L2ARC, vd); return (err); } /* * Attempts to read the device header on the provided L2ARC device and writes * it to `hdr'. On success, this function returns 0, otherwise the appropriate * error code is returned. */ static int l2arc_dev_hdr_read(l2arc_dev_t *dev) { int err; uint64_t guid; l2arc_dev_hdr_phys_t *l2dhdr = dev->l2ad_dev_hdr; const uint64_t l2dhdr_asize = dev->l2ad_dev_hdr_asize; abd_t *abd; guid = spa_guid(dev->l2ad_vdev->vdev_spa); abd = abd_get_from_buf(l2dhdr, l2dhdr_asize); err = zio_wait(zio_read_phys(NULL, dev->l2ad_vdev, VDEV_LABEL_START_SIZE, l2dhdr_asize, abd, ZIO_CHECKSUM_LABEL, NULL, NULL, ZIO_PRIORITY_SYNC_READ, ZIO_FLAG_DONT_CACHE | ZIO_FLAG_CANFAIL | ZIO_FLAG_DONT_PROPAGATE | ZIO_FLAG_DONT_RETRY | ZIO_FLAG_SPECULATIVE, B_FALSE)); abd_free(abd); if (err != 0) { ARCSTAT_BUMP(arcstat_l2_rebuild_abort_dh_errors); zfs_dbgmsg("L2ARC IO error (%d) while reading device header, " "vdev guid: %llu", err, (u_longlong_t)dev->l2ad_vdev->vdev_guid); return (err); } if (l2dhdr->dh_magic == BSWAP_64(L2ARC_DEV_HDR_MAGIC)) byteswap_uint64_array(l2dhdr, sizeof (*l2dhdr)); if (l2dhdr->dh_magic != L2ARC_DEV_HDR_MAGIC || l2dhdr->dh_spa_guid != guid || l2dhdr->dh_vdev_guid != dev->l2ad_vdev->vdev_guid || l2dhdr->dh_version != L2ARC_PERSISTENT_VERSION || l2dhdr->dh_log_entries != dev->l2ad_log_entries || l2dhdr->dh_end != dev->l2ad_end || !l2arc_range_check_overlap(dev->l2ad_start, dev->l2ad_end, l2dhdr->dh_evict) || (l2dhdr->dh_trim_state != VDEV_TRIM_COMPLETE && l2arc_trim_ahead > 0)) { /* * Attempt to rebuild a device containing no actual dev hdr * or containing a header from some other pool or from another * version of persistent L2ARC. */ ARCSTAT_BUMP(arcstat_l2_rebuild_abort_unsupported); return (SET_ERROR(ENOTSUP)); } return (0); } /* * Reads L2ARC log blocks from storage and validates their contents. * * This function implements a simple fetcher to make sure that while * we're processing one buffer the L2ARC is already fetching the next * one in the chain. * * The arguments this_lp and next_lp point to the current and next log block * address in the block chain. Similarly, this_lb and next_lb hold the * l2arc_log_blk_phys_t's of the current and next L2ARC blk. * * The `this_io' and `next_io' arguments are used for block fetching. * When issuing the first blk IO during rebuild, you should pass NULL for * `this_io'. This function will then issue a sync IO to read the block and * also issue an async IO to fetch the next block in the block chain. The * fetched IO is returned in `next_io'. On subsequent calls to this * function, pass the value returned in `next_io' from the previous call * as `this_io' and a fresh `next_io' pointer to hold the next fetch IO. * Prior to the call, you should initialize your `next_io' pointer to be * NULL. If no fetch IO was issued, the pointer is left set at NULL. * * On success, this function returns 0, otherwise it returns an appropriate * error code. On error the fetching IO is aborted and cleared before * returning from this function. Therefore, if we return `success', the * caller can assume that we have taken care of cleanup of fetch IOs. */ static int l2arc_log_blk_read(l2arc_dev_t *dev, const l2arc_log_blkptr_t *this_lbp, const l2arc_log_blkptr_t *next_lbp, l2arc_log_blk_phys_t *this_lb, l2arc_log_blk_phys_t *next_lb, zio_t *this_io, zio_t **next_io) { int err = 0; zio_cksum_t cksum; abd_t *abd = NULL; uint64_t asize; ASSERT(this_lbp != NULL && next_lbp != NULL); ASSERT(this_lb != NULL && next_lb != NULL); ASSERT(next_io != NULL && *next_io == NULL); ASSERT(l2arc_log_blkptr_valid(dev, this_lbp)); /* * Check to see if we have issued the IO for this log block in a * previous run. If not, this is the first call, so issue it now. */ if (this_io == NULL) { this_io = l2arc_log_blk_fetch(dev->l2ad_vdev, this_lbp, this_lb); } /* * Peek to see if we can start issuing the next IO immediately. */ if (l2arc_log_blkptr_valid(dev, next_lbp)) { /* * Start issuing IO for the next log block early - this * should help keep the L2ARC device busy while we * decompress and restore this log block. */ *next_io = l2arc_log_blk_fetch(dev->l2ad_vdev, next_lbp, next_lb); } /* Wait for the IO to read this log block to complete */ if ((err = zio_wait(this_io)) != 0) { ARCSTAT_BUMP(arcstat_l2_rebuild_abort_io_errors); zfs_dbgmsg("L2ARC IO error (%d) while reading log block, " "offset: %llu, vdev guid: %llu", err, (u_longlong_t)this_lbp->lbp_daddr, (u_longlong_t)dev->l2ad_vdev->vdev_guid); goto cleanup; } /* * Make sure the buffer checks out. * L2BLK_GET_PSIZE returns aligned size for log blocks. */ asize = L2BLK_GET_PSIZE((this_lbp)->lbp_prop); fletcher_4_native(this_lb, asize, NULL, &cksum); if (!ZIO_CHECKSUM_EQUAL(cksum, this_lbp->lbp_cksum)) { ARCSTAT_BUMP(arcstat_l2_rebuild_abort_cksum_lb_errors); zfs_dbgmsg("L2ARC log block cksum failed, offset: %llu, " "vdev guid: %llu, l2ad_hand: %llu, l2ad_evict: %llu", (u_longlong_t)this_lbp->lbp_daddr, (u_longlong_t)dev->l2ad_vdev->vdev_guid, (u_longlong_t)dev->l2ad_hand, (u_longlong_t)dev->l2ad_evict); err = SET_ERROR(ECKSUM); goto cleanup; } /* Now we can take our time decoding this buffer */ switch (L2BLK_GET_COMPRESS((this_lbp)->lbp_prop)) { case ZIO_COMPRESS_OFF: break; case ZIO_COMPRESS_LZ4: abd = abd_alloc_for_io(asize, B_TRUE); abd_copy_from_buf_off(abd, this_lb, 0, asize); if ((err = zio_decompress_data( L2BLK_GET_COMPRESS((this_lbp)->lbp_prop), abd, this_lb, asize, sizeof (*this_lb), NULL)) != 0) { err = SET_ERROR(EINVAL); goto cleanup; } break; default: err = SET_ERROR(EINVAL); goto cleanup; } if (this_lb->lb_magic == BSWAP_64(L2ARC_LOG_BLK_MAGIC)) byteswap_uint64_array(this_lb, sizeof (*this_lb)); if (this_lb->lb_magic != L2ARC_LOG_BLK_MAGIC) { err = SET_ERROR(EINVAL); goto cleanup; } cleanup: /* Abort an in-flight fetch I/O in case of error */ if (err != 0 && *next_io != NULL) { l2arc_log_blk_fetch_abort(*next_io); *next_io = NULL; } if (abd != NULL) abd_free(abd); return (err); } /* * Restores the payload of a log block to ARC. This creates empty ARC hdr * entries which only contain an l2arc hdr, essentially restoring the * buffers to their L2ARC evicted state. This function also updates space * usage on the L2ARC vdev to make sure it tracks restored buffers. */ static void l2arc_log_blk_restore(l2arc_dev_t *dev, const l2arc_log_blk_phys_t *lb, uint64_t lb_asize) { uint64_t size = 0, asize = 0; uint64_t log_entries = dev->l2ad_log_entries; /* * Usually arc_adapt() is called only for data, not headers, but * since we may allocate significant amount of memory here, let ARC * grow its arc_c. */ arc_adapt(log_entries * HDR_L2ONLY_SIZE, arc_l2c_only); for (int i = log_entries - 1; i >= 0; i--) { /* * Restore goes in the reverse temporal direction to preserve * correct temporal ordering of buffers in the l2ad_buflist. * l2arc_hdr_restore also does a list_insert_tail instead of * list_insert_head on the l2ad_buflist: * * LIST l2ad_buflist LIST * HEAD <------ (time) ------ TAIL * direction +-----+-----+-----+-----+-----+ direction * of l2arc <== | buf | buf | buf | buf | buf | ===> of rebuild * fill +-----+-----+-----+-----+-----+ * ^ ^ * | | * | | * l2arc_feed_thread l2arc_rebuild * will place new bufs here restores bufs here * * During l2arc_rebuild() the device is not used by * l2arc_feed_thread() as dev->l2ad_rebuild is set to true. */ size += L2BLK_GET_LSIZE((&lb->lb_entries[i])->le_prop); asize += vdev_psize_to_asize(dev->l2ad_vdev, L2BLK_GET_PSIZE((&lb->lb_entries[i])->le_prop)); l2arc_hdr_restore(&lb->lb_entries[i], dev); } /* * Record rebuild stats: * size Logical size of restored buffers in the L2ARC * asize Aligned size of restored buffers in the L2ARC */ ARCSTAT_INCR(arcstat_l2_rebuild_size, size); ARCSTAT_INCR(arcstat_l2_rebuild_asize, asize); ARCSTAT_INCR(arcstat_l2_rebuild_bufs, log_entries); ARCSTAT_F_AVG(arcstat_l2_log_blk_avg_asize, lb_asize); ARCSTAT_F_AVG(arcstat_l2_data_to_meta_ratio, asize / lb_asize); ARCSTAT_BUMP(arcstat_l2_rebuild_log_blks); } /* * Restores a single ARC buf hdr from a log entry. The ARC buffer is put * into a state indicating that it has been evicted to L2ARC. */ static void l2arc_hdr_restore(const l2arc_log_ent_phys_t *le, l2arc_dev_t *dev) { arc_buf_hdr_t *hdr, *exists; kmutex_t *hash_lock; arc_buf_contents_t type = L2BLK_GET_TYPE((le)->le_prop); uint64_t asize; /* * Do all the allocation before grabbing any locks, this lets us * sleep if memory is full and we don't have to deal with failed * allocations. */ hdr = arc_buf_alloc_l2only(L2BLK_GET_LSIZE((le)->le_prop), type, dev, le->le_dva, le->le_daddr, L2BLK_GET_PSIZE((le)->le_prop), le->le_birth, L2BLK_GET_COMPRESS((le)->le_prop), le->le_complevel, L2BLK_GET_PROTECTED((le)->le_prop), L2BLK_GET_PREFETCH((le)->le_prop), L2BLK_GET_STATE((le)->le_prop)); asize = vdev_psize_to_asize(dev->l2ad_vdev, L2BLK_GET_PSIZE((le)->le_prop)); /* * vdev_space_update() has to be called before arc_hdr_destroy() to * avoid underflow since the latter also calls vdev_space_update(). */ l2arc_hdr_arcstats_increment(hdr); vdev_space_update(dev->l2ad_vdev, asize, 0, 0); mutex_enter(&dev->l2ad_mtx); list_insert_tail(&dev->l2ad_buflist, hdr); (void) zfs_refcount_add_many(&dev->l2ad_alloc, arc_hdr_size(hdr), hdr); mutex_exit(&dev->l2ad_mtx); exists = buf_hash_insert(hdr, &hash_lock); if (exists) { /* Buffer was already cached, no need to restore it. */ arc_hdr_destroy(hdr); /* * If the buffer is already cached, check whether it has * L2ARC metadata. If not, enter them and update the flag. * This is important is case of onlining a cache device, since * we previously evicted all L2ARC metadata from ARC. */ if (!HDR_HAS_L2HDR(exists)) { arc_hdr_set_flags(exists, ARC_FLAG_HAS_L2HDR); exists->b_l2hdr.b_dev = dev; exists->b_l2hdr.b_daddr = le->le_daddr; exists->b_l2hdr.b_arcs_state = L2BLK_GET_STATE((le)->le_prop); mutex_enter(&dev->l2ad_mtx); list_insert_tail(&dev->l2ad_buflist, exists); (void) zfs_refcount_add_many(&dev->l2ad_alloc, arc_hdr_size(exists), exists); mutex_exit(&dev->l2ad_mtx); l2arc_hdr_arcstats_increment(exists); vdev_space_update(dev->l2ad_vdev, asize, 0, 0); } ARCSTAT_BUMP(arcstat_l2_rebuild_bufs_precached); } mutex_exit(hash_lock); } /* * Starts an asynchronous read IO to read a log block. This is used in log * block reconstruction to start reading the next block before we are done * decoding and reconstructing the current block, to keep the l2arc device * nice and hot with read IO to process. * The returned zio will contain a newly allocated memory buffers for the IO * data which should then be freed by the caller once the zio is no longer * needed (i.e. due to it having completed). If you wish to abort this * zio, you should do so using l2arc_log_blk_fetch_abort, which takes * care of disposing of the allocated buffers correctly. */ static zio_t * l2arc_log_blk_fetch(vdev_t *vd, const l2arc_log_blkptr_t *lbp, l2arc_log_blk_phys_t *lb) { uint32_t asize; zio_t *pio; l2arc_read_callback_t *cb; /* L2BLK_GET_PSIZE returns aligned size for log blocks */ asize = L2BLK_GET_PSIZE((lbp)->lbp_prop); ASSERT(asize <= sizeof (l2arc_log_blk_phys_t)); cb = kmem_zalloc(sizeof (l2arc_read_callback_t), KM_SLEEP); cb->l2rcb_abd = abd_get_from_buf(lb, asize); pio = zio_root(vd->vdev_spa, l2arc_blk_fetch_done, cb, ZIO_FLAG_DONT_CACHE | ZIO_FLAG_CANFAIL | ZIO_FLAG_DONT_PROPAGATE | ZIO_FLAG_DONT_RETRY); (void) zio_nowait(zio_read_phys(pio, vd, lbp->lbp_daddr, asize, cb->l2rcb_abd, ZIO_CHECKSUM_OFF, NULL, NULL, ZIO_PRIORITY_ASYNC_READ, ZIO_FLAG_DONT_CACHE | ZIO_FLAG_CANFAIL | ZIO_FLAG_DONT_PROPAGATE | ZIO_FLAG_DONT_RETRY, B_FALSE)); return (pio); } /* * Aborts a zio returned from l2arc_log_blk_fetch and frees the data * buffers allocated for it. */ static void l2arc_log_blk_fetch_abort(zio_t *zio) { (void) zio_wait(zio); } /* * Creates a zio to update the device header on an l2arc device. */ void l2arc_dev_hdr_update(l2arc_dev_t *dev) { l2arc_dev_hdr_phys_t *l2dhdr = dev->l2ad_dev_hdr; const uint64_t l2dhdr_asize = dev->l2ad_dev_hdr_asize; abd_t *abd; int err; VERIFY(spa_config_held(dev->l2ad_spa, SCL_STATE_ALL, RW_READER)); l2dhdr->dh_magic = L2ARC_DEV_HDR_MAGIC; l2dhdr->dh_version = L2ARC_PERSISTENT_VERSION; l2dhdr->dh_spa_guid = spa_guid(dev->l2ad_vdev->vdev_spa); l2dhdr->dh_vdev_guid = dev->l2ad_vdev->vdev_guid; l2dhdr->dh_log_entries = dev->l2ad_log_entries; l2dhdr->dh_evict = dev->l2ad_evict; l2dhdr->dh_start = dev->l2ad_start; l2dhdr->dh_end = dev->l2ad_end; l2dhdr->dh_lb_asize = zfs_refcount_count(&dev->l2ad_lb_asize); l2dhdr->dh_lb_count = zfs_refcount_count(&dev->l2ad_lb_count); l2dhdr->dh_flags = 0; l2dhdr->dh_trim_action_time = dev->l2ad_vdev->vdev_trim_action_time; l2dhdr->dh_trim_state = dev->l2ad_vdev->vdev_trim_state; if (dev->l2ad_first) l2dhdr->dh_flags |= L2ARC_DEV_HDR_EVICT_FIRST; abd = abd_get_from_buf(l2dhdr, l2dhdr_asize); err = zio_wait(zio_write_phys(NULL, dev->l2ad_vdev, VDEV_LABEL_START_SIZE, l2dhdr_asize, abd, ZIO_CHECKSUM_LABEL, NULL, NULL, ZIO_PRIORITY_ASYNC_WRITE, ZIO_FLAG_CANFAIL, B_FALSE)); abd_free(abd); if (err != 0) { zfs_dbgmsg("L2ARC IO error (%d) while writing device header, " "vdev guid: %llu", err, (u_longlong_t)dev->l2ad_vdev->vdev_guid); } } /* * Commits a log block to the L2ARC device. This routine is invoked from * l2arc_write_buffers when the log block fills up. * This function allocates some memory to temporarily hold the serialized * buffer to be written. This is then released in l2arc_write_done. */ static void l2arc_log_blk_commit(l2arc_dev_t *dev, zio_t *pio, l2arc_write_callback_t *cb) { l2arc_log_blk_phys_t *lb = &dev->l2ad_log_blk; l2arc_dev_hdr_phys_t *l2dhdr = dev->l2ad_dev_hdr; uint64_t psize, asize; zio_t *wzio; l2arc_lb_abd_buf_t *abd_buf; uint8_t *tmpbuf; l2arc_lb_ptr_buf_t *lb_ptr_buf; VERIFY3S(dev->l2ad_log_ent_idx, ==, dev->l2ad_log_entries); tmpbuf = zio_buf_alloc(sizeof (*lb)); abd_buf = zio_buf_alloc(sizeof (*abd_buf)); abd_buf->abd = abd_get_from_buf(lb, sizeof (*lb)); lb_ptr_buf = kmem_zalloc(sizeof (l2arc_lb_ptr_buf_t), KM_SLEEP); lb_ptr_buf->lb_ptr = kmem_zalloc(sizeof (l2arc_log_blkptr_t), KM_SLEEP); /* link the buffer into the block chain */ lb->lb_prev_lbp = l2dhdr->dh_start_lbps[1]; lb->lb_magic = L2ARC_LOG_BLK_MAGIC; /* * l2arc_log_blk_commit() may be called multiple times during a single * l2arc_write_buffers() call. Save the allocated abd buffers in a list * so we can free them in l2arc_write_done() later on. */ list_insert_tail(&cb->l2wcb_abd_list, abd_buf); /* try to compress the buffer */ psize = zio_compress_data(ZIO_COMPRESS_LZ4, abd_buf->abd, tmpbuf, sizeof (*lb), 0); /* a log block is never entirely zero */ ASSERT(psize != 0); asize = vdev_psize_to_asize(dev->l2ad_vdev, psize); ASSERT(asize <= sizeof (*lb)); /* * Update the start log block pointer in the device header to point * to the log block we're about to write. */ l2dhdr->dh_start_lbps[1] = l2dhdr->dh_start_lbps[0]; l2dhdr->dh_start_lbps[0].lbp_daddr = dev->l2ad_hand; l2dhdr->dh_start_lbps[0].lbp_payload_asize = dev->l2ad_log_blk_payload_asize; l2dhdr->dh_start_lbps[0].lbp_payload_start = dev->l2ad_log_blk_payload_start; L2BLK_SET_LSIZE( (&l2dhdr->dh_start_lbps[0])->lbp_prop, sizeof (*lb)); L2BLK_SET_PSIZE( (&l2dhdr->dh_start_lbps[0])->lbp_prop, asize); L2BLK_SET_CHECKSUM( (&l2dhdr->dh_start_lbps[0])->lbp_prop, ZIO_CHECKSUM_FLETCHER_4); if (asize < sizeof (*lb)) { /* compression succeeded */ memset(tmpbuf + psize, 0, asize - psize); L2BLK_SET_COMPRESS( (&l2dhdr->dh_start_lbps[0])->lbp_prop, ZIO_COMPRESS_LZ4); } else { /* compression failed */ memcpy(tmpbuf, lb, sizeof (*lb)); L2BLK_SET_COMPRESS( (&l2dhdr->dh_start_lbps[0])->lbp_prop, ZIO_COMPRESS_OFF); } /* checksum what we're about to write */ fletcher_4_native(tmpbuf, asize, NULL, &l2dhdr->dh_start_lbps[0].lbp_cksum); abd_free(abd_buf->abd); /* perform the write itself */ abd_buf->abd = abd_get_from_buf(tmpbuf, sizeof (*lb)); abd_take_ownership_of_buf(abd_buf->abd, B_TRUE); wzio = zio_write_phys(pio, dev->l2ad_vdev, dev->l2ad_hand, asize, abd_buf->abd, ZIO_CHECKSUM_OFF, NULL, NULL, ZIO_PRIORITY_ASYNC_WRITE, ZIO_FLAG_CANFAIL, B_FALSE); DTRACE_PROBE2(l2arc__write, vdev_t *, dev->l2ad_vdev, zio_t *, wzio); (void) zio_nowait(wzio); dev->l2ad_hand += asize; /* * Include the committed log block's pointer in the list of pointers * to log blocks present in the L2ARC device. */ memcpy(lb_ptr_buf->lb_ptr, &l2dhdr->dh_start_lbps[0], sizeof (l2arc_log_blkptr_t)); mutex_enter(&dev->l2ad_mtx); list_insert_head(&dev->l2ad_lbptr_list, lb_ptr_buf); ARCSTAT_INCR(arcstat_l2_log_blk_asize, asize); ARCSTAT_BUMP(arcstat_l2_log_blk_count); zfs_refcount_add_many(&dev->l2ad_lb_asize, asize, lb_ptr_buf); zfs_refcount_add(&dev->l2ad_lb_count, lb_ptr_buf); mutex_exit(&dev->l2ad_mtx); vdev_space_update(dev->l2ad_vdev, asize, 0, 0); /* bump the kstats */ ARCSTAT_INCR(arcstat_l2_write_bytes, asize); ARCSTAT_BUMP(arcstat_l2_log_blk_writes); ARCSTAT_F_AVG(arcstat_l2_log_blk_avg_asize, asize); ARCSTAT_F_AVG(arcstat_l2_data_to_meta_ratio, dev->l2ad_log_blk_payload_asize / asize); /* start a new log block */ dev->l2ad_log_ent_idx = 0; dev->l2ad_log_blk_payload_asize = 0; dev->l2ad_log_blk_payload_start = 0; } /* * Validates an L2ARC log block address to make sure that it can be read * from the provided L2ARC device. */ boolean_t l2arc_log_blkptr_valid(l2arc_dev_t *dev, const l2arc_log_blkptr_t *lbp) { /* L2BLK_GET_PSIZE returns aligned size for log blocks */ uint64_t asize = L2BLK_GET_PSIZE((lbp)->lbp_prop); uint64_t end = lbp->lbp_daddr + asize - 1; uint64_t start = lbp->lbp_payload_start; boolean_t evicted = B_FALSE; /* * A log block is valid if all of the following conditions are true: * - it fits entirely (including its payload) between l2ad_start and * l2ad_end * - it has a valid size * - neither the log block itself nor part of its payload was evicted * by l2arc_evict(): * * l2ad_hand l2ad_evict * | | lbp_daddr * | start | | end * | | | | | * V V V V V * l2ad_start ============================================ l2ad_end * --------------------------|||| * ^ ^ * | log block * payload */ evicted = l2arc_range_check_overlap(start, end, dev->l2ad_hand) || l2arc_range_check_overlap(start, end, dev->l2ad_evict) || l2arc_range_check_overlap(dev->l2ad_hand, dev->l2ad_evict, start) || l2arc_range_check_overlap(dev->l2ad_hand, dev->l2ad_evict, end); return (start >= dev->l2ad_start && end <= dev->l2ad_end && asize > 0 && asize <= sizeof (l2arc_log_blk_phys_t) && (!evicted || dev->l2ad_first)); } /* * Inserts ARC buffer header `hdr' into the current L2ARC log block on * the device. The buffer being inserted must be present in L2ARC. * Returns B_TRUE if the L2ARC log block is full and needs to be committed * to L2ARC, or B_FALSE if it still has room for more ARC buffers. */ static boolean_t l2arc_log_blk_insert(l2arc_dev_t *dev, const arc_buf_hdr_t *hdr) { l2arc_log_blk_phys_t *lb = &dev->l2ad_log_blk; l2arc_log_ent_phys_t *le; if (dev->l2ad_log_entries == 0) return (B_FALSE); int index = dev->l2ad_log_ent_idx++; ASSERT3S(index, <, dev->l2ad_log_entries); ASSERT(HDR_HAS_L2HDR(hdr)); le = &lb->lb_entries[index]; memset(le, 0, sizeof (*le)); le->le_dva = hdr->b_dva; le->le_birth = hdr->b_birth; le->le_daddr = hdr->b_l2hdr.b_daddr; if (index == 0) dev->l2ad_log_blk_payload_start = le->le_daddr; L2BLK_SET_LSIZE((le)->le_prop, HDR_GET_LSIZE(hdr)); L2BLK_SET_PSIZE((le)->le_prop, HDR_GET_PSIZE(hdr)); L2BLK_SET_COMPRESS((le)->le_prop, HDR_GET_COMPRESS(hdr)); le->le_complevel = hdr->b_complevel; L2BLK_SET_TYPE((le)->le_prop, hdr->b_type); L2BLK_SET_PROTECTED((le)->le_prop, !!(HDR_PROTECTED(hdr))); L2BLK_SET_PREFETCH((le)->le_prop, !!(HDR_PREFETCH(hdr))); L2BLK_SET_STATE((le)->le_prop, hdr->b_l1hdr.b_state->arcs_state); dev->l2ad_log_blk_payload_asize += vdev_psize_to_asize(dev->l2ad_vdev, HDR_GET_PSIZE(hdr)); return (dev->l2ad_log_ent_idx == dev->l2ad_log_entries); } /* * Checks whether a given L2ARC device address sits in a time-sequential * range. The trick here is that the L2ARC is a rotary buffer, so we can't * just do a range comparison, we need to handle the situation in which the * range wraps around the end of the L2ARC device. Arguments: * bottom -- Lower end of the range to check (written to earlier). * top -- Upper end of the range to check (written to later). * check -- The address for which we want to determine if it sits in * between the top and bottom. * * The 3-way conditional below represents the following cases: * * bottom < top : Sequentially ordered case: * --------+-------------------+ * | (overlap here?) | * L2ARC dev V V * |---------------============--------------| * * bottom > top: Looped-around case: * --------+------------------+ * | (overlap here?) | * L2ARC dev V V * |===============---------------===========| * ^ ^ * | (or here?) | * +---------------+--------- * * top == bottom : Just a single address comparison. */ boolean_t l2arc_range_check_overlap(uint64_t bottom, uint64_t top, uint64_t check) { if (bottom < top) return (bottom <= check && check <= top); else if (bottom > top) return (check <= top || bottom <= check); else return (check == top); } EXPORT_SYMBOL(arc_buf_size); EXPORT_SYMBOL(arc_write); EXPORT_SYMBOL(arc_read); EXPORT_SYMBOL(arc_buf_info); EXPORT_SYMBOL(arc_getbuf_func); EXPORT_SYMBOL(arc_add_prune_callback); EXPORT_SYMBOL(arc_remove_prune_callback); ZFS_MODULE_PARAM_CALL(zfs_arc, zfs_arc_, min, param_set_arc_min, spl_param_get_u64, ZMOD_RW, "Minimum ARC size in bytes"); ZFS_MODULE_PARAM_CALL(zfs_arc, zfs_arc_, max, param_set_arc_max, spl_param_get_u64, ZMOD_RW, "Maximum ARC size in bytes"); ZFS_MODULE_PARAM_CALL(zfs_arc, zfs_arc_, meta_limit, param_set_arc_u64, spl_param_get_u64, ZMOD_RW, "Metadata limit for ARC size in bytes"); ZFS_MODULE_PARAM_CALL(zfs_arc, zfs_arc_, meta_limit_percent, param_set_arc_int, param_get_uint, ZMOD_RW, "Percent of ARC size for ARC meta limit"); ZFS_MODULE_PARAM_CALL(zfs_arc, zfs_arc_, meta_min, param_set_arc_u64, spl_param_get_u64, ZMOD_RW, "Minimum ARC metadata size in bytes"); ZFS_MODULE_PARAM(zfs_arc, zfs_arc_, meta_prune, INT, ZMOD_RW, "Meta objects to scan for prune"); ZFS_MODULE_PARAM(zfs_arc, zfs_arc_, meta_adjust_restarts, UINT, ZMOD_RW, "Limit number of restarts in arc_evict_meta"); ZFS_MODULE_PARAM(zfs_arc, zfs_arc_, meta_strategy, UINT, ZMOD_RW, "Meta reclaim strategy"); ZFS_MODULE_PARAM_CALL(zfs_arc, zfs_arc_, grow_retry, param_set_arc_int, param_get_uint, ZMOD_RW, "Seconds before growing ARC size"); ZFS_MODULE_PARAM(zfs_arc, zfs_arc_, p_dampener_disable, INT, ZMOD_RW, "Disable arc_p adapt dampener"); ZFS_MODULE_PARAM_CALL(zfs_arc, zfs_arc_, shrink_shift, param_set_arc_int, param_get_uint, ZMOD_RW, "log2(fraction of ARC to reclaim)"); ZFS_MODULE_PARAM(zfs_arc, zfs_arc_, pc_percent, UINT, ZMOD_RW, "Percent of pagecache to reclaim ARC to"); ZFS_MODULE_PARAM_CALL(zfs_arc, zfs_arc_, p_min_shift, param_set_arc_int, param_get_uint, ZMOD_RW, "arc_c shift to calc min/max arc_p"); ZFS_MODULE_PARAM(zfs_arc, zfs_arc_, average_blocksize, UINT, ZMOD_RD, "Target average block size"); ZFS_MODULE_PARAM(zfs, zfs_, compressed_arc_enabled, INT, ZMOD_RW, "Disable compressed ARC buffers"); ZFS_MODULE_PARAM_CALL(zfs_arc, zfs_arc_, min_prefetch_ms, param_set_arc_int, param_get_uint, ZMOD_RW, "Min life of prefetch block in ms"); ZFS_MODULE_PARAM_CALL(zfs_arc, zfs_arc_, min_prescient_prefetch_ms, param_set_arc_int, param_get_uint, ZMOD_RW, "Min life of prescient prefetched block in ms"); ZFS_MODULE_PARAM(zfs_l2arc, l2arc_, write_max, U64, ZMOD_RW, "Max write bytes per interval"); ZFS_MODULE_PARAM(zfs_l2arc, l2arc_, write_boost, U64, ZMOD_RW, "Extra write bytes during device warmup"); ZFS_MODULE_PARAM(zfs_l2arc, l2arc_, headroom, U64, ZMOD_RW, "Number of max device writes to precache"); ZFS_MODULE_PARAM(zfs_l2arc, l2arc_, headroom_boost, U64, ZMOD_RW, "Compressed l2arc_headroom multiplier"); ZFS_MODULE_PARAM(zfs_l2arc, l2arc_, trim_ahead, U64, ZMOD_RW, "TRIM ahead L2ARC write size multiplier"); ZFS_MODULE_PARAM(zfs_l2arc, l2arc_, feed_secs, U64, ZMOD_RW, "Seconds between L2ARC writing"); ZFS_MODULE_PARAM(zfs_l2arc, l2arc_, feed_min_ms, U64, ZMOD_RW, "Min feed interval in milliseconds"); ZFS_MODULE_PARAM(zfs_l2arc, l2arc_, noprefetch, INT, ZMOD_RW, "Skip caching prefetched buffers"); ZFS_MODULE_PARAM(zfs_l2arc, l2arc_, feed_again, INT, ZMOD_RW, "Turbo L2ARC warmup"); ZFS_MODULE_PARAM(zfs_l2arc, l2arc_, norw, INT, ZMOD_RW, "No reads during writes"); ZFS_MODULE_PARAM(zfs_l2arc, l2arc_, meta_percent, UINT, ZMOD_RW, "Percent of ARC size allowed for L2ARC-only headers"); ZFS_MODULE_PARAM(zfs_l2arc, l2arc_, rebuild_enabled, INT, ZMOD_RW, "Rebuild the L2ARC when importing a pool"); ZFS_MODULE_PARAM(zfs_l2arc, l2arc_, rebuild_blocks_min_l2size, U64, ZMOD_RW, "Min size in bytes to write rebuild log blocks in L2ARC"); ZFS_MODULE_PARAM(zfs_l2arc, l2arc_, mfuonly, INT, ZMOD_RW, "Cache only MFU data from ARC into L2ARC"); ZFS_MODULE_PARAM(zfs_l2arc, l2arc_, exclude_special, INT, ZMOD_RW, "Exclude dbufs on special vdevs from being cached to L2ARC if set."); ZFS_MODULE_PARAM_CALL(zfs_arc, zfs_arc_, lotsfree_percent, param_set_arc_int, param_get_uint, ZMOD_RW, "System free memory I/O throttle in bytes"); ZFS_MODULE_PARAM_CALL(zfs_arc, zfs_arc_, sys_free, param_set_arc_u64, spl_param_get_u64, ZMOD_RW, "System free memory target size in bytes"); ZFS_MODULE_PARAM_CALL(zfs_arc, zfs_arc_, dnode_limit, param_set_arc_u64, spl_param_get_u64, ZMOD_RW, "Minimum bytes of dnodes in ARC"); ZFS_MODULE_PARAM_CALL(zfs_arc, zfs_arc_, dnode_limit_percent, param_set_arc_int, param_get_uint, ZMOD_RW, "Percent of ARC meta buffers for dnodes"); ZFS_MODULE_PARAM(zfs_arc, zfs_arc_, dnode_reduce_percent, UINT, ZMOD_RW, "Percentage of excess dnodes to try to unpin"); ZFS_MODULE_PARAM(zfs_arc, zfs_arc_, eviction_pct, UINT, ZMOD_RW, "When full, ARC allocation waits for eviction of this % of alloc size"); ZFS_MODULE_PARAM(zfs_arc, zfs_arc_, evict_batch_limit, UINT, ZMOD_RW, "The number of headers to evict per sublist before moving to the next"); ZFS_MODULE_PARAM(zfs_arc, zfs_arc_, prune_task_threads, INT, ZMOD_RW, "Number of arc_prune threads"); diff --git a/module/zfs/dbuf.c b/module/zfs/dbuf.c index 77e6ad23ef89..7982d9702896 100644 --- a/module/zfs/dbuf.c +++ b/module/zfs/dbuf.c @@ -1,5144 +1,5143 @@ /* * CDDL HEADER START * * The contents of this file are subject to the terms of the * Common Development and Distribution License (the "License"). * You may not use this file except in compliance with the License. * * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE * or https://opensource.org/licenses/CDDL-1.0. * See the License for the specific language governing permissions * and limitations under the License. * * When distributing Covered Code, include this CDDL HEADER in each * file and include the License file at usr/src/OPENSOLARIS.LICENSE. * If applicable, add the following below this CDDL HEADER, with the * fields enclosed by brackets "[]" replaced with your own identifying * information: Portions Copyright [yyyy] [name of copyright owner] * * CDDL HEADER END */ /* * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. * Copyright 2011 Nexenta Systems, Inc. All rights reserved. * Copyright (c) 2012, 2020 by Delphix. All rights reserved. * Copyright (c) 2013 by Saso Kiselkov. All rights reserved. * Copyright (c) 2014 Spectra Logic Corporation, All rights reserved. * Copyright (c) 2019, Klara Inc. * Copyright (c) 2019, Allan Jude */ #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include static kstat_t *dbuf_ksp; typedef struct dbuf_stats { /* * Various statistics about the size of the dbuf cache. */ kstat_named_t cache_count; kstat_named_t cache_size_bytes; kstat_named_t cache_size_bytes_max; /* * Statistics regarding the bounds on the dbuf cache size. */ kstat_named_t cache_target_bytes; kstat_named_t cache_lowater_bytes; kstat_named_t cache_hiwater_bytes; /* * Total number of dbuf cache evictions that have occurred. */ kstat_named_t cache_total_evicts; /* * The distribution of dbuf levels in the dbuf cache and * the total size of all dbufs at each level. */ kstat_named_t cache_levels[DN_MAX_LEVELS]; kstat_named_t cache_levels_bytes[DN_MAX_LEVELS]; /* * Statistics about the dbuf hash table. */ kstat_named_t hash_hits; kstat_named_t hash_misses; kstat_named_t hash_collisions; kstat_named_t hash_elements; kstat_named_t hash_elements_max; /* * Number of sublists containing more than one dbuf in the dbuf * hash table. Keep track of the longest hash chain. */ kstat_named_t hash_chains; kstat_named_t hash_chain_max; /* * Number of times a dbuf_create() discovers that a dbuf was * already created and in the dbuf hash table. */ kstat_named_t hash_insert_race; /* * Number of entries in the hash table dbuf and mutex arrays. */ kstat_named_t hash_table_count; kstat_named_t hash_mutex_count; /* * Statistics about the size of the metadata dbuf cache. */ kstat_named_t metadata_cache_count; kstat_named_t metadata_cache_size_bytes; kstat_named_t metadata_cache_size_bytes_max; /* * For diagnostic purposes, this is incremented whenever we can't add * something to the metadata cache because it's full, and instead put * the data in the regular dbuf cache. */ kstat_named_t metadata_cache_overflow; } dbuf_stats_t; dbuf_stats_t dbuf_stats = { { "cache_count", KSTAT_DATA_UINT64 }, { "cache_size_bytes", KSTAT_DATA_UINT64 }, { "cache_size_bytes_max", KSTAT_DATA_UINT64 }, { "cache_target_bytes", KSTAT_DATA_UINT64 }, { "cache_lowater_bytes", KSTAT_DATA_UINT64 }, { "cache_hiwater_bytes", KSTAT_DATA_UINT64 }, { "cache_total_evicts", KSTAT_DATA_UINT64 }, { { "cache_levels_N", KSTAT_DATA_UINT64 } }, { { "cache_levels_bytes_N", KSTAT_DATA_UINT64 } }, { "hash_hits", KSTAT_DATA_UINT64 }, { "hash_misses", KSTAT_DATA_UINT64 }, { "hash_collisions", KSTAT_DATA_UINT64 }, { "hash_elements", KSTAT_DATA_UINT64 }, { "hash_elements_max", KSTAT_DATA_UINT64 }, { "hash_chains", KSTAT_DATA_UINT64 }, { "hash_chain_max", KSTAT_DATA_UINT64 }, { "hash_insert_race", KSTAT_DATA_UINT64 }, { "hash_table_count", KSTAT_DATA_UINT64 }, { "hash_mutex_count", KSTAT_DATA_UINT64 }, { "metadata_cache_count", KSTAT_DATA_UINT64 }, { "metadata_cache_size_bytes", KSTAT_DATA_UINT64 }, { "metadata_cache_size_bytes_max", KSTAT_DATA_UINT64 }, { "metadata_cache_overflow", KSTAT_DATA_UINT64 } }; struct { wmsum_t cache_count; wmsum_t cache_total_evicts; wmsum_t cache_levels[DN_MAX_LEVELS]; wmsum_t cache_levels_bytes[DN_MAX_LEVELS]; wmsum_t hash_hits; wmsum_t hash_misses; wmsum_t hash_collisions; wmsum_t hash_chains; wmsum_t hash_insert_race; wmsum_t metadata_cache_count; wmsum_t metadata_cache_overflow; } dbuf_sums; #define DBUF_STAT_INCR(stat, val) \ wmsum_add(&dbuf_sums.stat, val); #define DBUF_STAT_DECR(stat, val) \ DBUF_STAT_INCR(stat, -(val)); #define DBUF_STAT_BUMP(stat) \ DBUF_STAT_INCR(stat, 1); #define DBUF_STAT_BUMPDOWN(stat) \ DBUF_STAT_INCR(stat, -1); #define DBUF_STAT_MAX(stat, v) { \ uint64_t _m; \ while ((v) > (_m = dbuf_stats.stat.value.ui64) && \ (_m != atomic_cas_64(&dbuf_stats.stat.value.ui64, _m, (v))))\ continue; \ } static boolean_t dbuf_undirty(dmu_buf_impl_t *db, dmu_tx_t *tx); static void dbuf_write(dbuf_dirty_record_t *dr, arc_buf_t *data, dmu_tx_t *tx); static void dbuf_sync_leaf_verify_bonus_dnode(dbuf_dirty_record_t *dr); static int dbuf_read_verify_dnode_crypt(dmu_buf_impl_t *db, uint32_t flags); /* * Global data structures and functions for the dbuf cache. */ static kmem_cache_t *dbuf_kmem_cache; static taskq_t *dbu_evict_taskq; static kthread_t *dbuf_cache_evict_thread; static kmutex_t dbuf_evict_lock; static kcondvar_t dbuf_evict_cv; static boolean_t dbuf_evict_thread_exit; /* * There are two dbuf caches; each dbuf can only be in one of them at a time. * * 1. Cache of metadata dbufs, to help make read-heavy administrative commands * from /sbin/zfs run faster. The "metadata cache" specifically stores dbufs * that represent the metadata that describes filesystems/snapshots/ * bookmarks/properties/etc. We only evict from this cache when we export a * pool, to short-circuit as much I/O as possible for all administrative * commands that need the metadata. There is no eviction policy for this * cache, because we try to only include types in it which would occupy a * very small amount of space per object but create a large impact on the * performance of these commands. Instead, after it reaches a maximum size * (which should only happen on very small memory systems with a very large * number of filesystem objects), we stop taking new dbufs into the * metadata cache, instead putting them in the normal dbuf cache. * * 2. LRU cache of dbufs. The dbuf cache maintains a list of dbufs that * are not currently held but have been recently released. These dbufs * are not eligible for arc eviction until they are aged out of the cache. * Dbufs that are aged out of the cache will be immediately destroyed and * become eligible for arc eviction. * * Dbufs are added to these caches once the last hold is released. If a dbuf is * later accessed and still exists in the dbuf cache, then it will be removed * from the cache and later re-added to the head of the cache. * * If a given dbuf meets the requirements for the metadata cache, it will go * there, otherwise it will be considered for the generic LRU dbuf cache. The * caches and the refcounts tracking their sizes are stored in an array indexed * by those caches' matching enum values (from dbuf_cached_state_t). */ typedef struct dbuf_cache { multilist_t cache; zfs_refcount_t size ____cacheline_aligned; } dbuf_cache_t; dbuf_cache_t dbuf_caches[DB_CACHE_MAX]; /* Size limits for the caches */ static uint64_t dbuf_cache_max_bytes = UINT64_MAX; static uint64_t dbuf_metadata_cache_max_bytes = UINT64_MAX; /* Set the default sizes of the caches to log2 fraction of arc size */ static uint_t dbuf_cache_shift = 5; static uint_t dbuf_metadata_cache_shift = 6; /* Set the dbuf hash mutex count as log2 shift (dynamic by default) */ static uint_t dbuf_mutex_cache_shift = 0; static unsigned long dbuf_cache_target_bytes(void); static unsigned long dbuf_metadata_cache_target_bytes(void); /* * The LRU dbuf cache uses a three-stage eviction policy: * - A low water marker designates when the dbuf eviction thread * should stop evicting from the dbuf cache. * - When we reach the maximum size (aka mid water mark), we * signal the eviction thread to run. * - The high water mark indicates when the eviction thread * is unable to keep up with the incoming load and eviction must * happen in the context of the calling thread. * * The dbuf cache: * (max size) * low water mid water hi water * +----------------------------------------+----------+----------+ * | | | | * | | | | * | | | | * | | | | * +----------------------------------------+----------+----------+ * stop signal evict * evicting eviction directly * thread * * The high and low water marks indicate the operating range for the eviction * thread. The low water mark is, by default, 90% of the total size of the * cache and the high water mark is at 110% (both of these percentages can be * changed by setting dbuf_cache_lowater_pct and dbuf_cache_hiwater_pct, * respectively). The eviction thread will try to ensure that the cache remains * within this range by waking up every second and checking if the cache is * above the low water mark. The thread can also be woken up by callers adding * elements into the cache if the cache is larger than the mid water (i.e max * cache size). Once the eviction thread is woken up and eviction is required, * it will continue evicting buffers until it's able to reduce the cache size * to the low water mark. If the cache size continues to grow and hits the high * water mark, then callers adding elements to the cache will begin to evict * directly from the cache until the cache is no longer above the high water * mark. */ /* * The percentage above and below the maximum cache size. */ static uint_t dbuf_cache_hiwater_pct = 10; static uint_t dbuf_cache_lowater_pct = 10; static int dbuf_cons(void *vdb, void *unused, int kmflag) { (void) unused, (void) kmflag; dmu_buf_impl_t *db = vdb; memset(db, 0, sizeof (dmu_buf_impl_t)); mutex_init(&db->db_mtx, NULL, MUTEX_DEFAULT, NULL); rw_init(&db->db_rwlock, NULL, RW_DEFAULT, NULL); cv_init(&db->db_changed, NULL, CV_DEFAULT, NULL); multilist_link_init(&db->db_cache_link); zfs_refcount_create(&db->db_holds); return (0); } static void dbuf_dest(void *vdb, void *unused) { (void) unused; dmu_buf_impl_t *db = vdb; mutex_destroy(&db->db_mtx); rw_destroy(&db->db_rwlock); cv_destroy(&db->db_changed); ASSERT(!multilist_link_active(&db->db_cache_link)); zfs_refcount_destroy(&db->db_holds); } /* * dbuf hash table routines */ static dbuf_hash_table_t dbuf_hash_table; /* * We use Cityhash for this. It's fast, and has good hash properties without * requiring any large static buffers. */ static uint64_t dbuf_hash(void *os, uint64_t obj, uint8_t lvl, uint64_t blkid) { return (cityhash4((uintptr_t)os, obj, (uint64_t)lvl, blkid)); } #define DTRACE_SET_STATE(db, why) \ DTRACE_PROBE2(dbuf__state_change, dmu_buf_impl_t *, db, \ const char *, why) #define DBUF_EQUAL(dbuf, os, obj, level, blkid) \ ((dbuf)->db.db_object == (obj) && \ (dbuf)->db_objset == (os) && \ (dbuf)->db_level == (level) && \ (dbuf)->db_blkid == (blkid)) dmu_buf_impl_t * dbuf_find(objset_t *os, uint64_t obj, uint8_t level, uint64_t blkid) { dbuf_hash_table_t *h = &dbuf_hash_table; uint64_t hv; uint64_t idx; dmu_buf_impl_t *db; hv = dbuf_hash(os, obj, level, blkid); idx = hv & h->hash_table_mask; mutex_enter(DBUF_HASH_MUTEX(h, idx)); for (db = h->hash_table[idx]; db != NULL; db = db->db_hash_next) { if (DBUF_EQUAL(db, os, obj, level, blkid)) { mutex_enter(&db->db_mtx); if (db->db_state != DB_EVICTING) { mutex_exit(DBUF_HASH_MUTEX(h, idx)); return (db); } mutex_exit(&db->db_mtx); } } mutex_exit(DBUF_HASH_MUTEX(h, idx)); return (NULL); } static dmu_buf_impl_t * dbuf_find_bonus(objset_t *os, uint64_t object) { dnode_t *dn; dmu_buf_impl_t *db = NULL; if (dnode_hold(os, object, FTAG, &dn) == 0) { rw_enter(&dn->dn_struct_rwlock, RW_READER); if (dn->dn_bonus != NULL) { db = dn->dn_bonus; mutex_enter(&db->db_mtx); } rw_exit(&dn->dn_struct_rwlock); dnode_rele(dn, FTAG); } return (db); } /* * Insert an entry into the hash table. If there is already an element * equal to elem in the hash table, then the already existing element * will be returned and the new element will not be inserted. * Otherwise returns NULL. */ static dmu_buf_impl_t * dbuf_hash_insert(dmu_buf_impl_t *db) { dbuf_hash_table_t *h = &dbuf_hash_table; objset_t *os = db->db_objset; uint64_t obj = db->db.db_object; int level = db->db_level; uint64_t blkid, hv, idx; dmu_buf_impl_t *dbf; uint32_t i; blkid = db->db_blkid; hv = dbuf_hash(os, obj, level, blkid); idx = hv & h->hash_table_mask; mutex_enter(DBUF_HASH_MUTEX(h, idx)); for (dbf = h->hash_table[idx], i = 0; dbf != NULL; dbf = dbf->db_hash_next, i++) { if (DBUF_EQUAL(dbf, os, obj, level, blkid)) { mutex_enter(&dbf->db_mtx); if (dbf->db_state != DB_EVICTING) { mutex_exit(DBUF_HASH_MUTEX(h, idx)); return (dbf); } mutex_exit(&dbf->db_mtx); } } if (i > 0) { DBUF_STAT_BUMP(hash_collisions); if (i == 1) DBUF_STAT_BUMP(hash_chains); DBUF_STAT_MAX(hash_chain_max, i); } mutex_enter(&db->db_mtx); db->db_hash_next = h->hash_table[idx]; h->hash_table[idx] = db; mutex_exit(DBUF_HASH_MUTEX(h, idx)); uint64_t he = atomic_inc_64_nv(&dbuf_stats.hash_elements.value.ui64); DBUF_STAT_MAX(hash_elements_max, he); return (NULL); } /* * This returns whether this dbuf should be stored in the metadata cache, which * is based on whether it's from one of the dnode types that store data related * to traversing dataset hierarchies. */ static boolean_t dbuf_include_in_metadata_cache(dmu_buf_impl_t *db) { DB_DNODE_ENTER(db); dmu_object_type_t type = DB_DNODE(db)->dn_type; DB_DNODE_EXIT(db); /* Check if this dbuf is one of the types we care about */ if (DMU_OT_IS_METADATA_CACHED(type)) { /* If we hit this, then we set something up wrong in dmu_ot */ ASSERT(DMU_OT_IS_METADATA(type)); /* * Sanity check for small-memory systems: don't allocate too * much memory for this purpose. */ if (zfs_refcount_count( &dbuf_caches[DB_DBUF_METADATA_CACHE].size) > dbuf_metadata_cache_target_bytes()) { DBUF_STAT_BUMP(metadata_cache_overflow); return (B_FALSE); } return (B_TRUE); } return (B_FALSE); } /* * Remove an entry from the hash table. It must be in the EVICTING state. */ static void dbuf_hash_remove(dmu_buf_impl_t *db) { dbuf_hash_table_t *h = &dbuf_hash_table; uint64_t hv, idx; dmu_buf_impl_t *dbf, **dbp; hv = dbuf_hash(db->db_objset, db->db.db_object, db->db_level, db->db_blkid); idx = hv & h->hash_table_mask; /* * We mustn't hold db_mtx to maintain lock ordering: * DBUF_HASH_MUTEX > db_mtx. */ ASSERT(zfs_refcount_is_zero(&db->db_holds)); ASSERT(db->db_state == DB_EVICTING); ASSERT(!MUTEX_HELD(&db->db_mtx)); mutex_enter(DBUF_HASH_MUTEX(h, idx)); dbp = &h->hash_table[idx]; while ((dbf = *dbp) != db) { dbp = &dbf->db_hash_next; ASSERT(dbf != NULL); } *dbp = db->db_hash_next; db->db_hash_next = NULL; if (h->hash_table[idx] && h->hash_table[idx]->db_hash_next == NULL) DBUF_STAT_BUMPDOWN(hash_chains); mutex_exit(DBUF_HASH_MUTEX(h, idx)); atomic_dec_64(&dbuf_stats.hash_elements.value.ui64); } typedef enum { DBVU_EVICTING, DBVU_NOT_EVICTING } dbvu_verify_type_t; static void dbuf_verify_user(dmu_buf_impl_t *db, dbvu_verify_type_t verify_type) { #ifdef ZFS_DEBUG int64_t holds; if (db->db_user == NULL) return; /* Only data blocks support the attachment of user data. */ ASSERT(db->db_level == 0); /* Clients must resolve a dbuf before attaching user data. */ ASSERT(db->db.db_data != NULL); ASSERT3U(db->db_state, ==, DB_CACHED); holds = zfs_refcount_count(&db->db_holds); if (verify_type == DBVU_EVICTING) { /* * Immediate eviction occurs when holds == dirtycnt. * For normal eviction buffers, holds is zero on * eviction, except when dbuf_fix_old_data() calls * dbuf_clear_data(). However, the hold count can grow * during eviction even though db_mtx is held (see * dmu_bonus_hold() for an example), so we can only * test the generic invariant that holds >= dirtycnt. */ ASSERT3U(holds, >=, db->db_dirtycnt); } else { if (db->db_user_immediate_evict == TRUE) ASSERT3U(holds, >=, db->db_dirtycnt); else ASSERT3U(holds, >, 0); } #endif } static void dbuf_evict_user(dmu_buf_impl_t *db) { dmu_buf_user_t *dbu = db->db_user; ASSERT(MUTEX_HELD(&db->db_mtx)); if (dbu == NULL) return; dbuf_verify_user(db, DBVU_EVICTING); db->db_user = NULL; #ifdef ZFS_DEBUG if (dbu->dbu_clear_on_evict_dbufp != NULL) *dbu->dbu_clear_on_evict_dbufp = NULL; #endif /* * There are two eviction callbacks - one that we call synchronously * and one that we invoke via a taskq. The async one is useful for * avoiding lock order reversals and limiting stack depth. * * Note that if we have a sync callback but no async callback, * it's likely that the sync callback will free the structure * containing the dbu. In that case we need to take care to not * dereference dbu after calling the sync evict func. */ boolean_t has_async = (dbu->dbu_evict_func_async != NULL); if (dbu->dbu_evict_func_sync != NULL) dbu->dbu_evict_func_sync(dbu); if (has_async) { taskq_dispatch_ent(dbu_evict_taskq, dbu->dbu_evict_func_async, dbu, 0, &dbu->dbu_tqent); } } boolean_t dbuf_is_metadata(dmu_buf_impl_t *db) { /* * Consider indirect blocks and spill blocks to be meta data. */ if (db->db_level > 0 || db->db_blkid == DMU_SPILL_BLKID) { return (B_TRUE); } else { boolean_t is_metadata; DB_DNODE_ENTER(db); is_metadata = DMU_OT_IS_METADATA(DB_DNODE(db)->dn_type); DB_DNODE_EXIT(db); return (is_metadata); } } /* * We want to exclude buffers that are on a special allocation class from * L2ARC. */ boolean_t dbuf_is_l2cacheable(dmu_buf_impl_t *db) { vdev_t *vd = NULL; zfs_cache_type_t cache = db->db_objset->os_secondary_cache; blkptr_t *bp = db->db_blkptr; if (bp != NULL && !BP_IS_HOLE(bp)) { uint64_t vdev = DVA_GET_VDEV(bp->blk_dva); vdev_t *rvd = db->db_objset->os_spa->spa_root_vdev; if (vdev < rvd->vdev_children) vd = rvd->vdev_child[vdev]; if (cache == ZFS_CACHE_ALL || (dbuf_is_metadata(db) && cache == ZFS_CACHE_METADATA)) { if (vd == NULL) return (B_TRUE); if ((vd->vdev_alloc_bias != VDEV_BIAS_SPECIAL && vd->vdev_alloc_bias != VDEV_BIAS_DEDUP) || l2arc_exclude_special == 0) return (B_TRUE); } } return (B_FALSE); } static inline boolean_t dnode_level_is_l2cacheable(blkptr_t *bp, dnode_t *dn, int64_t level) { vdev_t *vd = NULL; zfs_cache_type_t cache = dn->dn_objset->os_secondary_cache; if (bp != NULL && !BP_IS_HOLE(bp)) { uint64_t vdev = DVA_GET_VDEV(bp->blk_dva); vdev_t *rvd = dn->dn_objset->os_spa->spa_root_vdev; if (vdev < rvd->vdev_children) vd = rvd->vdev_child[vdev]; if (cache == ZFS_CACHE_ALL || ((level > 0 || DMU_OT_IS_METADATA(dn->dn_handle->dnh_dnode->dn_type)) && cache == ZFS_CACHE_METADATA)) { if (vd == NULL) return (B_TRUE); if ((vd->vdev_alloc_bias != VDEV_BIAS_SPECIAL && vd->vdev_alloc_bias != VDEV_BIAS_DEDUP) || l2arc_exclude_special == 0) return (B_TRUE); } } return (B_FALSE); } /* * This function *must* return indices evenly distributed between all * sublists of the multilist. This is needed due to how the dbuf eviction * code is laid out; dbuf_evict_thread() assumes dbufs are evenly * distributed between all sublists and uses this assumption when * deciding which sublist to evict from and how much to evict from it. */ static unsigned int dbuf_cache_multilist_index_func(multilist_t *ml, void *obj) { dmu_buf_impl_t *db = obj; /* * The assumption here, is the hash value for a given * dmu_buf_impl_t will remain constant throughout it's lifetime * (i.e. it's objset, object, level and blkid fields don't change). * Thus, we don't need to store the dbuf's sublist index * on insertion, as this index can be recalculated on removal. * * Also, the low order bits of the hash value are thought to be * distributed evenly. Otherwise, in the case that the multilist * has a power of two number of sublists, each sublists' usage * would not be evenly distributed. In this context full 64bit * division would be a waste of time, so limit it to 32 bits. */ return ((unsigned int)dbuf_hash(db->db_objset, db->db.db_object, db->db_level, db->db_blkid) % multilist_get_num_sublists(ml)); } /* * The target size of the dbuf cache can grow with the ARC target, * unless limited by the tunable dbuf_cache_max_bytes. */ static inline unsigned long dbuf_cache_target_bytes(void) { return (MIN(dbuf_cache_max_bytes, arc_target_bytes() >> dbuf_cache_shift)); } /* * The target size of the dbuf metadata cache can grow with the ARC target, * unless limited by the tunable dbuf_metadata_cache_max_bytes. */ static inline unsigned long dbuf_metadata_cache_target_bytes(void) { return (MIN(dbuf_metadata_cache_max_bytes, arc_target_bytes() >> dbuf_metadata_cache_shift)); } static inline uint64_t dbuf_cache_hiwater_bytes(void) { uint64_t dbuf_cache_target = dbuf_cache_target_bytes(); return (dbuf_cache_target + (dbuf_cache_target * dbuf_cache_hiwater_pct) / 100); } static inline uint64_t dbuf_cache_lowater_bytes(void) { uint64_t dbuf_cache_target = dbuf_cache_target_bytes(); return (dbuf_cache_target - (dbuf_cache_target * dbuf_cache_lowater_pct) / 100); } static inline boolean_t dbuf_cache_above_lowater(void) { return (zfs_refcount_count(&dbuf_caches[DB_DBUF_CACHE].size) > dbuf_cache_lowater_bytes()); } /* * Evict the oldest eligible dbuf from the dbuf cache. */ static void dbuf_evict_one(void) { int idx = multilist_get_random_index(&dbuf_caches[DB_DBUF_CACHE].cache); multilist_sublist_t *mls = multilist_sublist_lock( &dbuf_caches[DB_DBUF_CACHE].cache, idx); ASSERT(!MUTEX_HELD(&dbuf_evict_lock)); dmu_buf_impl_t *db = multilist_sublist_tail(mls); while (db != NULL && mutex_tryenter(&db->db_mtx) == 0) { db = multilist_sublist_prev(mls, db); } DTRACE_PROBE2(dbuf__evict__one, dmu_buf_impl_t *, db, multilist_sublist_t *, mls); if (db != NULL) { multilist_sublist_remove(mls, db); multilist_sublist_unlock(mls); (void) zfs_refcount_remove_many( &dbuf_caches[DB_DBUF_CACHE].size, db->db.db_size, db); DBUF_STAT_BUMPDOWN(cache_levels[db->db_level]); DBUF_STAT_BUMPDOWN(cache_count); DBUF_STAT_DECR(cache_levels_bytes[db->db_level], db->db.db_size); ASSERT3U(db->db_caching_status, ==, DB_DBUF_CACHE); db->db_caching_status = DB_NO_CACHE; dbuf_destroy(db); DBUF_STAT_BUMP(cache_total_evicts); } else { multilist_sublist_unlock(mls); } } /* * The dbuf evict thread is responsible for aging out dbufs from the * cache. Once the cache has reached it's maximum size, dbufs are removed * and destroyed. The eviction thread will continue running until the size * of the dbuf cache is at or below the maximum size. Once the dbuf is aged * out of the cache it is destroyed and becomes eligible for arc eviction. */ static __attribute__((noreturn)) void dbuf_evict_thread(void *unused) { (void) unused; callb_cpr_t cpr; CALLB_CPR_INIT(&cpr, &dbuf_evict_lock, callb_generic_cpr, FTAG); mutex_enter(&dbuf_evict_lock); while (!dbuf_evict_thread_exit) { while (!dbuf_cache_above_lowater() && !dbuf_evict_thread_exit) { CALLB_CPR_SAFE_BEGIN(&cpr); (void) cv_timedwait_idle_hires(&dbuf_evict_cv, &dbuf_evict_lock, SEC2NSEC(1), MSEC2NSEC(1), 0); CALLB_CPR_SAFE_END(&cpr, &dbuf_evict_lock); } mutex_exit(&dbuf_evict_lock); /* * Keep evicting as long as we're above the low water mark * for the cache. We do this without holding the locks to * minimize lock contention. */ while (dbuf_cache_above_lowater() && !dbuf_evict_thread_exit) { dbuf_evict_one(); } mutex_enter(&dbuf_evict_lock); } dbuf_evict_thread_exit = B_FALSE; cv_broadcast(&dbuf_evict_cv); CALLB_CPR_EXIT(&cpr); /* drops dbuf_evict_lock */ thread_exit(); } /* * Wake up the dbuf eviction thread if the dbuf cache is at its max size. * If the dbuf cache is at its high water mark, then evict a dbuf from the * dbuf cache using the caller's context. */ static void dbuf_evict_notify(uint64_t size) { /* * We check if we should evict without holding the dbuf_evict_lock, * because it's OK to occasionally make the wrong decision here, * and grabbing the lock results in massive lock contention. */ if (size > dbuf_cache_target_bytes()) { if (size > dbuf_cache_hiwater_bytes()) dbuf_evict_one(); cv_signal(&dbuf_evict_cv); } } static int dbuf_kstat_update(kstat_t *ksp, int rw) { dbuf_stats_t *ds = ksp->ks_data; dbuf_hash_table_t *h = &dbuf_hash_table; if (rw == KSTAT_WRITE) return (SET_ERROR(EACCES)); ds->cache_count.value.ui64 = wmsum_value(&dbuf_sums.cache_count); ds->cache_size_bytes.value.ui64 = zfs_refcount_count(&dbuf_caches[DB_DBUF_CACHE].size); ds->cache_target_bytes.value.ui64 = dbuf_cache_target_bytes(); ds->cache_hiwater_bytes.value.ui64 = dbuf_cache_hiwater_bytes(); ds->cache_lowater_bytes.value.ui64 = dbuf_cache_lowater_bytes(); ds->cache_total_evicts.value.ui64 = wmsum_value(&dbuf_sums.cache_total_evicts); for (int i = 0; i < DN_MAX_LEVELS; i++) { ds->cache_levels[i].value.ui64 = wmsum_value(&dbuf_sums.cache_levels[i]); ds->cache_levels_bytes[i].value.ui64 = wmsum_value(&dbuf_sums.cache_levels_bytes[i]); } ds->hash_hits.value.ui64 = wmsum_value(&dbuf_sums.hash_hits); ds->hash_misses.value.ui64 = wmsum_value(&dbuf_sums.hash_misses); ds->hash_collisions.value.ui64 = wmsum_value(&dbuf_sums.hash_collisions); ds->hash_chains.value.ui64 = wmsum_value(&dbuf_sums.hash_chains); ds->hash_insert_race.value.ui64 = wmsum_value(&dbuf_sums.hash_insert_race); ds->hash_table_count.value.ui64 = h->hash_table_mask + 1; ds->hash_mutex_count.value.ui64 = h->hash_mutex_mask + 1; ds->metadata_cache_count.value.ui64 = wmsum_value(&dbuf_sums.metadata_cache_count); ds->metadata_cache_size_bytes.value.ui64 = zfs_refcount_count( &dbuf_caches[DB_DBUF_METADATA_CACHE].size); ds->metadata_cache_overflow.value.ui64 = wmsum_value(&dbuf_sums.metadata_cache_overflow); return (0); } void dbuf_init(void) { uint64_t hmsize, hsize = 1ULL << 16; dbuf_hash_table_t *h = &dbuf_hash_table; /* * The hash table is big enough to fill one eighth of physical memory * with an average block size of zfs_arc_average_blocksize (default 8K). * By default, the table will take up * totalmem * sizeof(void*) / 8K (1MB per GB with 8-byte pointers). */ while (hsize * zfs_arc_average_blocksize < arc_all_memory() / 8) hsize <<= 1; h->hash_table = NULL; while (h->hash_table == NULL) { h->hash_table_mask = hsize - 1; h->hash_table = vmem_zalloc(hsize * sizeof (void *), KM_SLEEP); if (h->hash_table == NULL) hsize >>= 1; ASSERT3U(hsize, >=, 1ULL << 10); } /* * The hash table buckets are protected by an array of mutexes where * each mutex is reponsible for protecting 128 buckets. A minimum * array size of 8192 is targeted to avoid contention. */ if (dbuf_mutex_cache_shift == 0) hmsize = MAX(hsize >> 7, 1ULL << 13); else hmsize = 1ULL << MIN(dbuf_mutex_cache_shift, 24); h->hash_mutexes = NULL; while (h->hash_mutexes == NULL) { h->hash_mutex_mask = hmsize - 1; h->hash_mutexes = vmem_zalloc(hmsize * sizeof (kmutex_t), KM_SLEEP); if (h->hash_mutexes == NULL) hmsize >>= 1; } dbuf_kmem_cache = kmem_cache_create("dmu_buf_impl_t", sizeof (dmu_buf_impl_t), 0, dbuf_cons, dbuf_dest, NULL, NULL, NULL, 0); for (int i = 0; i < hmsize; i++) mutex_init(&h->hash_mutexes[i], NULL, MUTEX_DEFAULT, NULL); dbuf_stats_init(h); /* * All entries are queued via taskq_dispatch_ent(), so min/maxalloc * configuration is not required. */ dbu_evict_taskq = taskq_create("dbu_evict", 1, defclsyspri, 0, 0, 0); for (dbuf_cached_state_t dcs = 0; dcs < DB_CACHE_MAX; dcs++) { multilist_create(&dbuf_caches[dcs].cache, sizeof (dmu_buf_impl_t), offsetof(dmu_buf_impl_t, db_cache_link), dbuf_cache_multilist_index_func); zfs_refcount_create(&dbuf_caches[dcs].size); } dbuf_evict_thread_exit = B_FALSE; mutex_init(&dbuf_evict_lock, NULL, MUTEX_DEFAULT, NULL); cv_init(&dbuf_evict_cv, NULL, CV_DEFAULT, NULL); dbuf_cache_evict_thread = thread_create(NULL, 0, dbuf_evict_thread, NULL, 0, &p0, TS_RUN, minclsyspri); wmsum_init(&dbuf_sums.cache_count, 0); wmsum_init(&dbuf_sums.cache_total_evicts, 0); for (int i = 0; i < DN_MAX_LEVELS; i++) { wmsum_init(&dbuf_sums.cache_levels[i], 0); wmsum_init(&dbuf_sums.cache_levels_bytes[i], 0); } wmsum_init(&dbuf_sums.hash_hits, 0); wmsum_init(&dbuf_sums.hash_misses, 0); wmsum_init(&dbuf_sums.hash_collisions, 0); wmsum_init(&dbuf_sums.hash_chains, 0); wmsum_init(&dbuf_sums.hash_insert_race, 0); wmsum_init(&dbuf_sums.metadata_cache_count, 0); wmsum_init(&dbuf_sums.metadata_cache_overflow, 0); dbuf_ksp = kstat_create("zfs", 0, "dbufstats", "misc", KSTAT_TYPE_NAMED, sizeof (dbuf_stats) / sizeof (kstat_named_t), KSTAT_FLAG_VIRTUAL); if (dbuf_ksp != NULL) { for (int i = 0; i < DN_MAX_LEVELS; i++) { snprintf(dbuf_stats.cache_levels[i].name, KSTAT_STRLEN, "cache_level_%d", i); dbuf_stats.cache_levels[i].data_type = KSTAT_DATA_UINT64; snprintf(dbuf_stats.cache_levels_bytes[i].name, KSTAT_STRLEN, "cache_level_%d_bytes", i); dbuf_stats.cache_levels_bytes[i].data_type = KSTAT_DATA_UINT64; } dbuf_ksp->ks_data = &dbuf_stats; dbuf_ksp->ks_update = dbuf_kstat_update; kstat_install(dbuf_ksp); } } void dbuf_fini(void) { dbuf_hash_table_t *h = &dbuf_hash_table; dbuf_stats_destroy(); for (int i = 0; i < (h->hash_mutex_mask + 1); i++) mutex_destroy(&h->hash_mutexes[i]); vmem_free(h->hash_table, (h->hash_table_mask + 1) * sizeof (void *)); vmem_free(h->hash_mutexes, (h->hash_mutex_mask + 1) * sizeof (kmutex_t)); kmem_cache_destroy(dbuf_kmem_cache); taskq_destroy(dbu_evict_taskq); mutex_enter(&dbuf_evict_lock); dbuf_evict_thread_exit = B_TRUE; while (dbuf_evict_thread_exit) { cv_signal(&dbuf_evict_cv); cv_wait(&dbuf_evict_cv, &dbuf_evict_lock); } mutex_exit(&dbuf_evict_lock); mutex_destroy(&dbuf_evict_lock); cv_destroy(&dbuf_evict_cv); for (dbuf_cached_state_t dcs = 0; dcs < DB_CACHE_MAX; dcs++) { zfs_refcount_destroy(&dbuf_caches[dcs].size); multilist_destroy(&dbuf_caches[dcs].cache); } if (dbuf_ksp != NULL) { kstat_delete(dbuf_ksp); dbuf_ksp = NULL; } wmsum_fini(&dbuf_sums.cache_count); wmsum_fini(&dbuf_sums.cache_total_evicts); for (int i = 0; i < DN_MAX_LEVELS; i++) { wmsum_fini(&dbuf_sums.cache_levels[i]); wmsum_fini(&dbuf_sums.cache_levels_bytes[i]); } wmsum_fini(&dbuf_sums.hash_hits); wmsum_fini(&dbuf_sums.hash_misses); wmsum_fini(&dbuf_sums.hash_collisions); wmsum_fini(&dbuf_sums.hash_chains); wmsum_fini(&dbuf_sums.hash_insert_race); wmsum_fini(&dbuf_sums.metadata_cache_count); wmsum_fini(&dbuf_sums.metadata_cache_overflow); } /* * Other stuff. */ #ifdef ZFS_DEBUG static void dbuf_verify(dmu_buf_impl_t *db) { dnode_t *dn; dbuf_dirty_record_t *dr; uint32_t txg_prev; ASSERT(MUTEX_HELD(&db->db_mtx)); if (!(zfs_flags & ZFS_DEBUG_DBUF_VERIFY)) return; ASSERT(db->db_objset != NULL); DB_DNODE_ENTER(db); dn = DB_DNODE(db); if (dn == NULL) { ASSERT(db->db_parent == NULL); ASSERT(db->db_blkptr == NULL); } else { ASSERT3U(db->db.db_object, ==, dn->dn_object); ASSERT3P(db->db_objset, ==, dn->dn_objset); ASSERT3U(db->db_level, <, dn->dn_nlevels); ASSERT(db->db_blkid == DMU_BONUS_BLKID || db->db_blkid == DMU_SPILL_BLKID || !avl_is_empty(&dn->dn_dbufs)); } if (db->db_blkid == DMU_BONUS_BLKID) { ASSERT(dn != NULL); ASSERT3U(db->db.db_size, >=, dn->dn_bonuslen); ASSERT3U(db->db.db_offset, ==, DMU_BONUS_BLKID); } else if (db->db_blkid == DMU_SPILL_BLKID) { ASSERT(dn != NULL); ASSERT0(db->db.db_offset); } else { ASSERT3U(db->db.db_offset, ==, db->db_blkid * db->db.db_size); } if ((dr = list_head(&db->db_dirty_records)) != NULL) { ASSERT(dr->dr_dbuf == db); txg_prev = dr->dr_txg; for (dr = list_next(&db->db_dirty_records, dr); dr != NULL; dr = list_next(&db->db_dirty_records, dr)) { ASSERT(dr->dr_dbuf == db); ASSERT(txg_prev > dr->dr_txg); txg_prev = dr->dr_txg; } } /* * We can't assert that db_size matches dn_datablksz because it * can be momentarily different when another thread is doing * dnode_set_blksz(). */ if (db->db_level == 0 && db->db.db_object == DMU_META_DNODE_OBJECT) { dr = db->db_data_pending; /* * It should only be modified in syncing context, so * make sure we only have one copy of the data. */ ASSERT(dr == NULL || dr->dt.dl.dr_data == db->db_buf); } /* verify db->db_blkptr */ if (db->db_blkptr) { if (db->db_parent == dn->dn_dbuf) { /* db is pointed to by the dnode */ /* ASSERT3U(db->db_blkid, <, dn->dn_nblkptr); */ if (DMU_OBJECT_IS_SPECIAL(db->db.db_object)) ASSERT(db->db_parent == NULL); else ASSERT(db->db_parent != NULL); if (db->db_blkid != DMU_SPILL_BLKID) ASSERT3P(db->db_blkptr, ==, &dn->dn_phys->dn_blkptr[db->db_blkid]); } else { /* db is pointed to by an indirect block */ int epb __maybe_unused = db->db_parent->db.db_size >> SPA_BLKPTRSHIFT; ASSERT3U(db->db_parent->db_level, ==, db->db_level+1); ASSERT3U(db->db_parent->db.db_object, ==, db->db.db_object); /* * dnode_grow_indblksz() can make this fail if we don't * have the parent's rwlock. XXX indblksz no longer * grows. safe to do this now? */ if (RW_LOCK_HELD(&db->db_parent->db_rwlock)) { ASSERT3P(db->db_blkptr, ==, ((blkptr_t *)db->db_parent->db.db_data + db->db_blkid % epb)); } } } if ((db->db_blkptr == NULL || BP_IS_HOLE(db->db_blkptr)) && (db->db_buf == NULL || db->db_buf->b_data) && db->db.db_data && db->db_blkid != DMU_BONUS_BLKID && db->db_state != DB_FILL && !dn->dn_free_txg) { /* * If the blkptr isn't set but they have nonzero data, * it had better be dirty, otherwise we'll lose that * data when we evict this buffer. * * There is an exception to this rule for indirect blocks; in * this case, if the indirect block is a hole, we fill in a few * fields on each of the child blocks (importantly, birth time) * to prevent hole birth times from being lost when you * partially fill in a hole. */ if (db->db_dirtycnt == 0) { if (db->db_level == 0) { uint64_t *buf = db->db.db_data; int i; for (i = 0; i < db->db.db_size >> 3; i++) { ASSERT(buf[i] == 0); } } else { blkptr_t *bps = db->db.db_data; ASSERT3U(1 << DB_DNODE(db)->dn_indblkshift, ==, db->db.db_size); /* * We want to verify that all the blkptrs in the * indirect block are holes, but we may have * automatically set up a few fields for them. * We iterate through each blkptr and verify * they only have those fields set. */ for (int i = 0; i < db->db.db_size / sizeof (blkptr_t); i++) { blkptr_t *bp = &bps[i]; ASSERT(ZIO_CHECKSUM_IS_ZERO( &bp->blk_cksum)); ASSERT( DVA_IS_EMPTY(&bp->blk_dva[0]) && DVA_IS_EMPTY(&bp->blk_dva[1]) && DVA_IS_EMPTY(&bp->blk_dva[2])); ASSERT0(bp->blk_fill); ASSERT0(bp->blk_pad[0]); ASSERT0(bp->blk_pad[1]); ASSERT(!BP_IS_EMBEDDED(bp)); ASSERT(BP_IS_HOLE(bp)); ASSERT0(bp->blk_phys_birth); } } } } DB_DNODE_EXIT(db); } #endif static void dbuf_clear_data(dmu_buf_impl_t *db) { ASSERT(MUTEX_HELD(&db->db_mtx)); dbuf_evict_user(db); ASSERT3P(db->db_buf, ==, NULL); db->db.db_data = NULL; if (db->db_state != DB_NOFILL) { db->db_state = DB_UNCACHED; DTRACE_SET_STATE(db, "clear data"); } } static void dbuf_set_data(dmu_buf_impl_t *db, arc_buf_t *buf) { ASSERT(MUTEX_HELD(&db->db_mtx)); ASSERT(buf != NULL); db->db_buf = buf; ASSERT(buf->b_data != NULL); db->db.db_data = buf->b_data; } static arc_buf_t * dbuf_alloc_arcbuf(dmu_buf_impl_t *db) { spa_t *spa = db->db_objset->os_spa; return (arc_alloc_buf(spa, db, DBUF_GET_BUFC_TYPE(db), db->db.db_size)); } /* * Loan out an arc_buf for read. Return the loaned arc_buf. */ arc_buf_t * dbuf_loan_arcbuf(dmu_buf_impl_t *db) { arc_buf_t *abuf; ASSERT(db->db_blkid != DMU_BONUS_BLKID); mutex_enter(&db->db_mtx); if (arc_released(db->db_buf) || zfs_refcount_count(&db->db_holds) > 1) { int blksz = db->db.db_size; spa_t *spa = db->db_objset->os_spa; mutex_exit(&db->db_mtx); abuf = arc_loan_buf(spa, B_FALSE, blksz); memcpy(abuf->b_data, db->db.db_data, blksz); } else { abuf = db->db_buf; arc_loan_inuse_buf(abuf, db); db->db_buf = NULL; dbuf_clear_data(db); mutex_exit(&db->db_mtx); } return (abuf); } /* * Calculate which level n block references the data at the level 0 offset * provided. */ uint64_t dbuf_whichblock(const dnode_t *dn, const int64_t level, const uint64_t offset) { if (dn->dn_datablkshift != 0 && dn->dn_indblkshift != 0) { /* * The level n blkid is equal to the level 0 blkid divided by * the number of level 0s in a level n block. * * The level 0 blkid is offset >> datablkshift = * offset / 2^datablkshift. * * The number of level 0s in a level n is the number of block * pointers in an indirect block, raised to the power of level. * This is 2^(indblkshift - SPA_BLKPTRSHIFT)^level = * 2^(level*(indblkshift - SPA_BLKPTRSHIFT)). * * Thus, the level n blkid is: offset / * ((2^datablkshift)*(2^(level*(indblkshift-SPA_BLKPTRSHIFT)))) * = offset / 2^(datablkshift + level * * (indblkshift - SPA_BLKPTRSHIFT)) * = offset >> (datablkshift + level * * (indblkshift - SPA_BLKPTRSHIFT)) */ const unsigned exp = dn->dn_datablkshift + level * (dn->dn_indblkshift - SPA_BLKPTRSHIFT); if (exp >= 8 * sizeof (offset)) { /* This only happens on the highest indirection level */ ASSERT3U(level, ==, dn->dn_nlevels - 1); return (0); } ASSERT3U(exp, <, 8 * sizeof (offset)); return (offset >> exp); } else { ASSERT3U(offset, <, dn->dn_datablksz); return (0); } } /* * This function is used to lock the parent of the provided dbuf. This should be * used when modifying or reading db_blkptr. */ db_lock_type_t dmu_buf_lock_parent(dmu_buf_impl_t *db, krw_t rw, const void *tag) { enum db_lock_type ret = DLT_NONE; if (db->db_parent != NULL) { rw_enter(&db->db_parent->db_rwlock, rw); ret = DLT_PARENT; } else if (dmu_objset_ds(db->db_objset) != NULL) { rrw_enter(&dmu_objset_ds(db->db_objset)->ds_bp_rwlock, rw, tag); ret = DLT_OBJSET; } /* * We only return a DLT_NONE lock when it's the top-most indirect block * of the meta-dnode of the MOS. */ return (ret); } /* * We need to pass the lock type in because it's possible that the block will * move from being the topmost indirect block in a dnode (and thus, have no * parent) to not the top-most via an indirection increase. This would cause a * panic if we didn't pass the lock type in. */ void dmu_buf_unlock_parent(dmu_buf_impl_t *db, db_lock_type_t type, const void *tag) { if (type == DLT_PARENT) rw_exit(&db->db_parent->db_rwlock); else if (type == DLT_OBJSET) rrw_exit(&dmu_objset_ds(db->db_objset)->ds_bp_rwlock, tag); } static void dbuf_read_done(zio_t *zio, const zbookmark_phys_t *zb, const blkptr_t *bp, arc_buf_t *buf, void *vdb) { (void) zb, (void) bp; dmu_buf_impl_t *db = vdb; mutex_enter(&db->db_mtx); ASSERT3U(db->db_state, ==, DB_READ); /* * All reads are synchronous, so we must have a hold on the dbuf */ ASSERT(zfs_refcount_count(&db->db_holds) > 0); ASSERT(db->db_buf == NULL); ASSERT(db->db.db_data == NULL); if (buf == NULL) { /* i/o error */ ASSERT(zio == NULL || zio->io_error != 0); ASSERT(db->db_blkid != DMU_BONUS_BLKID); ASSERT3P(db->db_buf, ==, NULL); db->db_state = DB_UNCACHED; DTRACE_SET_STATE(db, "i/o error"); } else if (db->db_level == 0 && db->db_freed_in_flight) { /* freed in flight */ ASSERT(zio == NULL || zio->io_error == 0); arc_release(buf, db); memset(buf->b_data, 0, db->db.db_size); arc_buf_freeze(buf); db->db_freed_in_flight = FALSE; dbuf_set_data(db, buf); db->db_state = DB_CACHED; DTRACE_SET_STATE(db, "freed in flight"); } else { /* success */ ASSERT(zio == NULL || zio->io_error == 0); dbuf_set_data(db, buf); db->db_state = DB_CACHED; DTRACE_SET_STATE(db, "successful read"); } cv_broadcast(&db->db_changed); dbuf_rele_and_unlock(db, NULL, B_FALSE); } /* * Shortcut for performing reads on bonus dbufs. Returns * an error if we fail to verify the dnode associated with * a decrypted block. Otherwise success. */ static int dbuf_read_bonus(dmu_buf_impl_t *db, dnode_t *dn, uint32_t flags) { int bonuslen, max_bonuslen, err; err = dbuf_read_verify_dnode_crypt(db, flags); if (err) return (err); bonuslen = MIN(dn->dn_bonuslen, dn->dn_phys->dn_bonuslen); max_bonuslen = DN_SLOTS_TO_BONUSLEN(dn->dn_num_slots); ASSERT(MUTEX_HELD(&db->db_mtx)); ASSERT(DB_DNODE_HELD(db)); ASSERT3U(bonuslen, <=, db->db.db_size); db->db.db_data = kmem_alloc(max_bonuslen, KM_SLEEP); arc_space_consume(max_bonuslen, ARC_SPACE_BONUS); if (bonuslen < max_bonuslen) memset(db->db.db_data, 0, max_bonuslen); if (bonuslen) memcpy(db->db.db_data, DN_BONUS(dn->dn_phys), bonuslen); db->db_state = DB_CACHED; DTRACE_SET_STATE(db, "bonus buffer filled"); return (0); } static void dbuf_handle_indirect_hole(dmu_buf_impl_t *db, dnode_t *dn) { blkptr_t *bps = db->db.db_data; uint32_t indbs = 1ULL << dn->dn_indblkshift; int n_bps = indbs >> SPA_BLKPTRSHIFT; for (int i = 0; i < n_bps; i++) { blkptr_t *bp = &bps[i]; ASSERT3U(BP_GET_LSIZE(db->db_blkptr), ==, indbs); BP_SET_LSIZE(bp, BP_GET_LEVEL(db->db_blkptr) == 1 ? dn->dn_datablksz : BP_GET_LSIZE(db->db_blkptr)); BP_SET_TYPE(bp, BP_GET_TYPE(db->db_blkptr)); BP_SET_LEVEL(bp, BP_GET_LEVEL(db->db_blkptr) - 1); BP_SET_BIRTH(bp, db->db_blkptr->blk_birth, 0); } } /* * Handle reads on dbufs that are holes, if necessary. This function * requires that the dbuf's mutex is held. Returns success (0) if action * was taken, ENOENT if no action was taken. */ static int dbuf_read_hole(dmu_buf_impl_t *db, dnode_t *dn) { ASSERT(MUTEX_HELD(&db->db_mtx)); int is_hole = db->db_blkptr == NULL || BP_IS_HOLE(db->db_blkptr); /* * For level 0 blocks only, if the above check fails: * Recheck BP_IS_HOLE() after dnode_block_freed() in case dnode_sync() * processes the delete record and clears the bp while we are waiting * for the dn_mtx (resulting in a "no" from block_freed). */ if (!is_hole && db->db_level == 0) { is_hole = dnode_block_freed(dn, db->db_blkid) || BP_IS_HOLE(db->db_blkptr); } if (is_hole) { dbuf_set_data(db, dbuf_alloc_arcbuf(db)); memset(db->db.db_data, 0, db->db.db_size); if (db->db_blkptr != NULL && db->db_level > 0 && BP_IS_HOLE(db->db_blkptr) && db->db_blkptr->blk_birth != 0) { dbuf_handle_indirect_hole(db, dn); } db->db_state = DB_CACHED; DTRACE_SET_STATE(db, "hole read satisfied"); return (0); } return (ENOENT); } /* * This function ensures that, when doing a decrypting read of a block, * we make sure we have decrypted the dnode associated with it. We must do * this so that we ensure we are fully authenticating the checksum-of-MACs * tree from the root of the objset down to this block. Indirect blocks are * always verified against their secure checksum-of-MACs assuming that the * dnode containing them is correct. Now that we are doing a decrypting read, * we can be sure that the key is loaded and verify that assumption. This is * especially important considering that we always read encrypted dnode * blocks as raw data (without verifying their MACs) to start, and * decrypt / authenticate them when we need to read an encrypted bonus buffer. */ static int dbuf_read_verify_dnode_crypt(dmu_buf_impl_t *db, uint32_t flags) { int err = 0; objset_t *os = db->db_objset; arc_buf_t *dnode_abuf; dnode_t *dn; zbookmark_phys_t zb; ASSERT(MUTEX_HELD(&db->db_mtx)); if (!os->os_encrypted || os->os_raw_receive || (flags & DB_RF_NO_DECRYPT) != 0) return (0); DB_DNODE_ENTER(db); dn = DB_DNODE(db); dnode_abuf = (dn->dn_dbuf != NULL) ? dn->dn_dbuf->db_buf : NULL; if (dnode_abuf == NULL || !arc_is_encrypted(dnode_abuf)) { DB_DNODE_EXIT(db); return (0); } SET_BOOKMARK(&zb, dmu_objset_id(os), DMU_META_DNODE_OBJECT, 0, dn->dn_dbuf->db_blkid); err = arc_untransform(dnode_abuf, os->os_spa, &zb, B_TRUE); /* * An error code of EACCES tells us that the key is still not * available. This is ok if we are only reading authenticated * (and therefore non-encrypted) blocks. */ if (err == EACCES && ((db->db_blkid != DMU_BONUS_BLKID && !DMU_OT_IS_ENCRYPTED(dn->dn_type)) || (db->db_blkid == DMU_BONUS_BLKID && !DMU_OT_IS_ENCRYPTED(dn->dn_bonustype)))) err = 0; DB_DNODE_EXIT(db); return (err); } /* * Drops db_mtx and the parent lock specified by dblt and tag before * returning. */ static int dbuf_read_impl(dmu_buf_impl_t *db, zio_t *zio, uint32_t flags, db_lock_type_t dblt, const void *tag) { dnode_t *dn; zbookmark_phys_t zb; uint32_t aflags = ARC_FLAG_NOWAIT; int err, zio_flags; - err = zio_flags = 0; DB_DNODE_ENTER(db); dn = DB_DNODE(db); ASSERT(!zfs_refcount_is_zero(&db->db_holds)); ASSERT(MUTEX_HELD(&db->db_mtx)); ASSERT(db->db_state == DB_UNCACHED); ASSERT(db->db_buf == NULL); ASSERT(db->db_parent == NULL || RW_LOCK_HELD(&db->db_parent->db_rwlock)); if (db->db_blkid == DMU_BONUS_BLKID) { err = dbuf_read_bonus(db, dn, flags); goto early_unlock; } err = dbuf_read_hole(db, dn); if (err == 0) goto early_unlock; /* * Any attempt to read a redacted block should result in an error. This * will never happen under normal conditions, but can be useful for * debugging purposes. */ if (BP_IS_REDACTED(db->db_blkptr)) { ASSERT(dsl_dataset_feature_is_active( db->db_objset->os_dsl_dataset, SPA_FEATURE_REDACTED_DATASETS)); err = SET_ERROR(EIO); goto early_unlock; } SET_BOOKMARK(&zb, dmu_objset_id(db->db_objset), db->db.db_object, db->db_level, db->db_blkid); /* * All bps of an encrypted os should have the encryption bit set. * If this is not true it indicates tampering and we report an error. */ if (db->db_objset->os_encrypted && !BP_USES_CRYPT(db->db_blkptr)) { spa_log_error(db->db_objset->os_spa, &zb); zfs_panic_recover("unencrypted block in encrypted " "object set %llu", dmu_objset_id(db->db_objset)); err = SET_ERROR(EIO); goto early_unlock; } err = dbuf_read_verify_dnode_crypt(db, flags); if (err != 0) goto early_unlock; DB_DNODE_EXIT(db); db->db_state = DB_READ; DTRACE_SET_STATE(db, "read issued"); mutex_exit(&db->db_mtx); if (dbuf_is_l2cacheable(db)) aflags |= ARC_FLAG_L2CACHE; dbuf_add_ref(db, NULL); zio_flags = (flags & DB_RF_CANFAIL) ? ZIO_FLAG_CANFAIL : ZIO_FLAG_MUSTSUCCEED; if ((flags & DB_RF_NO_DECRYPT) && BP_IS_PROTECTED(db->db_blkptr)) zio_flags |= ZIO_FLAG_RAW; /* * The zio layer will copy the provided blkptr later, but we need to * do this now so that we can release the parent's rwlock. We have to * do that now so that if dbuf_read_done is called synchronously (on * an l1 cache hit) we don't acquire the db_mtx while holding the * parent's rwlock, which would be a lock ordering violation. */ blkptr_t bp = *db->db_blkptr; dmu_buf_unlock_parent(db, dblt, tag); (void) arc_read(zio, db->db_objset->os_spa, &bp, dbuf_read_done, db, ZIO_PRIORITY_SYNC_READ, zio_flags, &aflags, &zb); return (err); early_unlock: DB_DNODE_EXIT(db); mutex_exit(&db->db_mtx); dmu_buf_unlock_parent(db, dblt, tag); return (err); } /* * This is our just-in-time copy function. It makes a copy of buffers that * have been modified in a previous transaction group before we access them in * the current active group. * * This function is used in three places: when we are dirtying a buffer for the * first time in a txg, when we are freeing a range in a dnode that includes * this buffer, and when we are accessing a buffer which was received compressed * and later referenced in a WRITE_BYREF record. * * Note that when we are called from dbuf_free_range() we do not put a hold on * the buffer, we just traverse the active dbuf list for the dnode. */ static void dbuf_fix_old_data(dmu_buf_impl_t *db, uint64_t txg) { dbuf_dirty_record_t *dr = list_head(&db->db_dirty_records); ASSERT(MUTEX_HELD(&db->db_mtx)); ASSERT(db->db.db_data != NULL); ASSERT(db->db_level == 0); ASSERT(db->db.db_object != DMU_META_DNODE_OBJECT); if (dr == NULL || (dr->dt.dl.dr_data != ((db->db_blkid == DMU_BONUS_BLKID) ? db->db.db_data : db->db_buf))) return; /* * If the last dirty record for this dbuf has not yet synced * and its referencing the dbuf data, either: * reset the reference to point to a new copy, * or (if there a no active holders) * just null out the current db_data pointer. */ ASSERT3U(dr->dr_txg, >=, txg - 2); if (db->db_blkid == DMU_BONUS_BLKID) { dnode_t *dn = DB_DNODE(db); int bonuslen = DN_SLOTS_TO_BONUSLEN(dn->dn_num_slots); dr->dt.dl.dr_data = kmem_alloc(bonuslen, KM_SLEEP); arc_space_consume(bonuslen, ARC_SPACE_BONUS); memcpy(dr->dt.dl.dr_data, db->db.db_data, bonuslen); } else if (zfs_refcount_count(&db->db_holds) > db->db_dirtycnt) { dnode_t *dn = DB_DNODE(db); int size = arc_buf_size(db->db_buf); arc_buf_contents_t type = DBUF_GET_BUFC_TYPE(db); spa_t *spa = db->db_objset->os_spa; enum zio_compress compress_type = arc_get_compression(db->db_buf); uint8_t complevel = arc_get_complevel(db->db_buf); if (arc_is_encrypted(db->db_buf)) { boolean_t byteorder; uint8_t salt[ZIO_DATA_SALT_LEN]; uint8_t iv[ZIO_DATA_IV_LEN]; uint8_t mac[ZIO_DATA_MAC_LEN]; arc_get_raw_params(db->db_buf, &byteorder, salt, iv, mac); dr->dt.dl.dr_data = arc_alloc_raw_buf(spa, db, dmu_objset_id(dn->dn_objset), byteorder, salt, iv, mac, dn->dn_type, size, arc_buf_lsize(db->db_buf), compress_type, complevel); } else if (compress_type != ZIO_COMPRESS_OFF) { ASSERT3U(type, ==, ARC_BUFC_DATA); dr->dt.dl.dr_data = arc_alloc_compressed_buf(spa, db, size, arc_buf_lsize(db->db_buf), compress_type, complevel); } else { dr->dt.dl.dr_data = arc_alloc_buf(spa, db, type, size); } memcpy(dr->dt.dl.dr_data->b_data, db->db.db_data, size); } else { db->db_buf = NULL; dbuf_clear_data(db); } } int dbuf_read(dmu_buf_impl_t *db, zio_t *zio, uint32_t flags) { int err = 0; boolean_t prefetch; dnode_t *dn; /* * We don't have to hold the mutex to check db_state because it * can't be freed while we have a hold on the buffer. */ ASSERT(!zfs_refcount_is_zero(&db->db_holds)); if (db->db_state == DB_NOFILL) return (SET_ERROR(EIO)); DB_DNODE_ENTER(db); dn = DB_DNODE(db); prefetch = db->db_level == 0 && db->db_blkid != DMU_BONUS_BLKID && (flags & DB_RF_NOPREFETCH) == 0 && dn != NULL && DBUF_IS_CACHEABLE(db); mutex_enter(&db->db_mtx); if (db->db_state == DB_CACHED) { spa_t *spa = dn->dn_objset->os_spa; /* * Ensure that this block's dnode has been decrypted if * the caller has requested decrypted data. */ err = dbuf_read_verify_dnode_crypt(db, flags); /* * If the arc buf is compressed or encrypted and the caller * requested uncompressed data, we need to untransform it * before returning. We also call arc_untransform() on any * unauthenticated blocks, which will verify their MAC if * the key is now available. */ if (err == 0 && db->db_buf != NULL && (flags & DB_RF_NO_DECRYPT) == 0 && (arc_is_encrypted(db->db_buf) || arc_is_unauthenticated(db->db_buf) || arc_get_compression(db->db_buf) != ZIO_COMPRESS_OFF)) { zbookmark_phys_t zb; SET_BOOKMARK(&zb, dmu_objset_id(db->db_objset), db->db.db_object, db->db_level, db->db_blkid); dbuf_fix_old_data(db, spa_syncing_txg(spa)); err = arc_untransform(db->db_buf, spa, &zb, B_FALSE); dbuf_set_data(db, db->db_buf); } mutex_exit(&db->db_mtx); if (err == 0 && prefetch) { dmu_zfetch(&dn->dn_zfetch, db->db_blkid, 1, B_TRUE, B_FALSE, flags & DB_RF_HAVESTRUCT); } DB_DNODE_EXIT(db); DBUF_STAT_BUMP(hash_hits); } else if (db->db_state == DB_UNCACHED) { spa_t *spa = dn->dn_objset->os_spa; boolean_t need_wait = B_FALSE; db_lock_type_t dblt = dmu_buf_lock_parent(db, RW_READER, FTAG); if (zio == NULL && db->db_blkptr != NULL && !BP_IS_HOLE(db->db_blkptr)) { zio = zio_root(spa, NULL, NULL, ZIO_FLAG_CANFAIL); need_wait = B_TRUE; } err = dbuf_read_impl(db, zio, flags, dblt, FTAG); /* * dbuf_read_impl has dropped db_mtx and our parent's rwlock * for us */ if (!err && prefetch) { dmu_zfetch(&dn->dn_zfetch, db->db_blkid, 1, B_TRUE, db->db_state != DB_CACHED, flags & DB_RF_HAVESTRUCT); } DB_DNODE_EXIT(db); DBUF_STAT_BUMP(hash_misses); /* * If we created a zio_root we must execute it to avoid * leaking it, even if it isn't attached to any work due * to an error in dbuf_read_impl(). */ if (need_wait) { if (err == 0) err = zio_wait(zio); else VERIFY0(zio_wait(zio)); } } else { /* * Another reader came in while the dbuf was in flight * between UNCACHED and CACHED. Either a writer will finish * writing the buffer (sending the dbuf to CACHED) or the * first reader's request will reach the read_done callback * and send the dbuf to CACHED. Otherwise, a failure * occurred and the dbuf went to UNCACHED. */ mutex_exit(&db->db_mtx); if (prefetch) { dmu_zfetch(&dn->dn_zfetch, db->db_blkid, 1, B_TRUE, B_TRUE, flags & DB_RF_HAVESTRUCT); } DB_DNODE_EXIT(db); DBUF_STAT_BUMP(hash_misses); /* Skip the wait per the caller's request. */ if ((flags & DB_RF_NEVERWAIT) == 0) { mutex_enter(&db->db_mtx); while (db->db_state == DB_READ || db->db_state == DB_FILL) { ASSERT(db->db_state == DB_READ || (flags & DB_RF_HAVESTRUCT) == 0); DTRACE_PROBE2(blocked__read, dmu_buf_impl_t *, db, zio_t *, zio); cv_wait(&db->db_changed, &db->db_mtx); } if (db->db_state == DB_UNCACHED) err = SET_ERROR(EIO); mutex_exit(&db->db_mtx); } } return (err); } static void dbuf_noread(dmu_buf_impl_t *db) { ASSERT(!zfs_refcount_is_zero(&db->db_holds)); ASSERT(db->db_blkid != DMU_BONUS_BLKID); mutex_enter(&db->db_mtx); while (db->db_state == DB_READ || db->db_state == DB_FILL) cv_wait(&db->db_changed, &db->db_mtx); if (db->db_state == DB_UNCACHED) { ASSERT(db->db_buf == NULL); ASSERT(db->db.db_data == NULL); dbuf_set_data(db, dbuf_alloc_arcbuf(db)); db->db_state = DB_FILL; DTRACE_SET_STATE(db, "assigning filled buffer"); } else if (db->db_state == DB_NOFILL) { dbuf_clear_data(db); } else { ASSERT3U(db->db_state, ==, DB_CACHED); } mutex_exit(&db->db_mtx); } void dbuf_unoverride(dbuf_dirty_record_t *dr) { dmu_buf_impl_t *db = dr->dr_dbuf; blkptr_t *bp = &dr->dt.dl.dr_overridden_by; uint64_t txg = dr->dr_txg; ASSERT(MUTEX_HELD(&db->db_mtx)); /* * This assert is valid because dmu_sync() expects to be called by * a zilog's get_data while holding a range lock. This call only * comes from dbuf_dirty() callers who must also hold a range lock. */ ASSERT(dr->dt.dl.dr_override_state != DR_IN_DMU_SYNC); ASSERT(db->db_level == 0); if (db->db_blkid == DMU_BONUS_BLKID || dr->dt.dl.dr_override_state == DR_NOT_OVERRIDDEN) return; ASSERT(db->db_data_pending != dr); /* free this block */ if (!BP_IS_HOLE(bp) && !dr->dt.dl.dr_nopwrite) zio_free(db->db_objset->os_spa, txg, bp); dr->dt.dl.dr_override_state = DR_NOT_OVERRIDDEN; dr->dt.dl.dr_nopwrite = B_FALSE; dr->dt.dl.dr_has_raw_params = B_FALSE; /* * Release the already-written buffer, so we leave it in * a consistent dirty state. Note that all callers are * modifying the buffer, so they will immediately do * another (redundant) arc_release(). Therefore, leave * the buf thawed to save the effort of freezing & * immediately re-thawing it. */ arc_release(dr->dt.dl.dr_data, db); } /* * Evict (if its unreferenced) or clear (if its referenced) any level-0 * data blocks in the free range, so that any future readers will find * empty blocks. */ void dbuf_free_range(dnode_t *dn, uint64_t start_blkid, uint64_t end_blkid, dmu_tx_t *tx) { dmu_buf_impl_t *db_search; dmu_buf_impl_t *db, *db_next; uint64_t txg = tx->tx_txg; avl_index_t where; dbuf_dirty_record_t *dr; if (end_blkid > dn->dn_maxblkid && !(start_blkid == DMU_SPILL_BLKID || end_blkid == DMU_SPILL_BLKID)) end_blkid = dn->dn_maxblkid; dprintf_dnode(dn, "start=%llu end=%llu\n", (u_longlong_t)start_blkid, (u_longlong_t)end_blkid); db_search = kmem_alloc(sizeof (dmu_buf_impl_t), KM_SLEEP); db_search->db_level = 0; db_search->db_blkid = start_blkid; db_search->db_state = DB_SEARCH; mutex_enter(&dn->dn_dbufs_mtx); db = avl_find(&dn->dn_dbufs, db_search, &where); ASSERT3P(db, ==, NULL); db = avl_nearest(&dn->dn_dbufs, where, AVL_AFTER); for (; db != NULL; db = db_next) { db_next = AVL_NEXT(&dn->dn_dbufs, db); ASSERT(db->db_blkid != DMU_BONUS_BLKID); if (db->db_level != 0 || db->db_blkid > end_blkid) { break; } ASSERT3U(db->db_blkid, >=, start_blkid); /* found a level 0 buffer in the range */ mutex_enter(&db->db_mtx); if (dbuf_undirty(db, tx)) { /* mutex has been dropped and dbuf destroyed */ continue; } if (db->db_state == DB_UNCACHED || db->db_state == DB_NOFILL || db->db_state == DB_EVICTING) { ASSERT(db->db.db_data == NULL); mutex_exit(&db->db_mtx); continue; } if (db->db_state == DB_READ || db->db_state == DB_FILL) { /* will be handled in dbuf_read_done or dbuf_rele */ db->db_freed_in_flight = TRUE; mutex_exit(&db->db_mtx); continue; } if (zfs_refcount_count(&db->db_holds) == 0) { ASSERT(db->db_buf); dbuf_destroy(db); continue; } /* The dbuf is referenced */ dr = list_head(&db->db_dirty_records); if (dr != NULL) { if (dr->dr_txg == txg) { /* * This buffer is "in-use", re-adjust the file * size to reflect that this buffer may * contain new data when we sync. */ if (db->db_blkid != DMU_SPILL_BLKID && db->db_blkid > dn->dn_maxblkid) dn->dn_maxblkid = db->db_blkid; dbuf_unoverride(dr); } else { /* * This dbuf is not dirty in the open context. * Either uncache it (if its not referenced in * the open context) or reset its contents to * empty. */ dbuf_fix_old_data(db, txg); } } /* clear the contents if its cached */ if (db->db_state == DB_CACHED) { ASSERT(db->db.db_data != NULL); arc_release(db->db_buf, db); rw_enter(&db->db_rwlock, RW_WRITER); memset(db->db.db_data, 0, db->db.db_size); rw_exit(&db->db_rwlock); arc_buf_freeze(db->db_buf); } mutex_exit(&db->db_mtx); } mutex_exit(&dn->dn_dbufs_mtx); kmem_free(db_search, sizeof (dmu_buf_impl_t)); } void dbuf_new_size(dmu_buf_impl_t *db, int size, dmu_tx_t *tx) { arc_buf_t *buf, *old_buf; dbuf_dirty_record_t *dr; int osize = db->db.db_size; arc_buf_contents_t type = DBUF_GET_BUFC_TYPE(db); dnode_t *dn; ASSERT(db->db_blkid != DMU_BONUS_BLKID); DB_DNODE_ENTER(db); dn = DB_DNODE(db); /* * XXX we should be doing a dbuf_read, checking the return * value and returning that up to our callers */ dmu_buf_will_dirty(&db->db, tx); /* create the data buffer for the new block */ buf = arc_alloc_buf(dn->dn_objset->os_spa, db, type, size); /* copy old block data to the new block */ old_buf = db->db_buf; memcpy(buf->b_data, old_buf->b_data, MIN(osize, size)); /* zero the remainder */ if (size > osize) memset((uint8_t *)buf->b_data + osize, 0, size - osize); mutex_enter(&db->db_mtx); dbuf_set_data(db, buf); arc_buf_destroy(old_buf, db); db->db.db_size = size; dr = list_head(&db->db_dirty_records); /* dirty record added by dmu_buf_will_dirty() */ VERIFY(dr != NULL); if (db->db_level == 0) dr->dt.dl.dr_data = buf; ASSERT3U(dr->dr_txg, ==, tx->tx_txg); ASSERT3U(dr->dr_accounted, ==, osize); dr->dr_accounted = size; mutex_exit(&db->db_mtx); dmu_objset_willuse_space(dn->dn_objset, size - osize, tx); DB_DNODE_EXIT(db); } void dbuf_release_bp(dmu_buf_impl_t *db) { objset_t *os __maybe_unused = db->db_objset; ASSERT(dsl_pool_sync_context(dmu_objset_pool(os))); ASSERT(arc_released(os->os_phys_buf) || list_link_active(&os->os_dsl_dataset->ds_synced_link)); ASSERT(db->db_parent == NULL || arc_released(db->db_parent->db_buf)); (void) arc_release(db->db_buf, db); } /* * We already have a dirty record for this TXG, and we are being * dirtied again. */ static void dbuf_redirty(dbuf_dirty_record_t *dr) { dmu_buf_impl_t *db = dr->dr_dbuf; ASSERT(MUTEX_HELD(&db->db_mtx)); if (db->db_level == 0 && db->db_blkid != DMU_BONUS_BLKID) { /* * If this buffer has already been written out, * we now need to reset its state. */ dbuf_unoverride(dr); if (db->db.db_object != DMU_META_DNODE_OBJECT && db->db_state != DB_NOFILL) { /* Already released on initial dirty, so just thaw. */ ASSERT(arc_released(db->db_buf)); arc_buf_thaw(db->db_buf); } } } dbuf_dirty_record_t * dbuf_dirty_lightweight(dnode_t *dn, uint64_t blkid, dmu_tx_t *tx) { rw_enter(&dn->dn_struct_rwlock, RW_READER); IMPLY(dn->dn_objset->os_raw_receive, dn->dn_maxblkid >= blkid); dnode_new_blkid(dn, blkid, tx, B_TRUE, B_FALSE); ASSERT(dn->dn_maxblkid >= blkid); dbuf_dirty_record_t *dr = kmem_zalloc(sizeof (*dr), KM_SLEEP); list_link_init(&dr->dr_dirty_node); list_link_init(&dr->dr_dbuf_node); dr->dr_dnode = dn; dr->dr_txg = tx->tx_txg; dr->dt.dll.dr_blkid = blkid; dr->dr_accounted = dn->dn_datablksz; /* * There should not be any dbuf for the block that we're dirtying. * Otherwise the buffer contents could be inconsistent between the * dbuf and the lightweight dirty record. */ ASSERT3P(NULL, ==, dbuf_find(dn->dn_objset, dn->dn_object, 0, blkid)); mutex_enter(&dn->dn_mtx); int txgoff = tx->tx_txg & TXG_MASK; if (dn->dn_free_ranges[txgoff] != NULL) { range_tree_clear(dn->dn_free_ranges[txgoff], blkid, 1); } if (dn->dn_nlevels == 1) { ASSERT3U(blkid, <, dn->dn_nblkptr); list_insert_tail(&dn->dn_dirty_records[txgoff], dr); mutex_exit(&dn->dn_mtx); rw_exit(&dn->dn_struct_rwlock); dnode_setdirty(dn, tx); } else { mutex_exit(&dn->dn_mtx); int epbs = dn->dn_indblkshift - SPA_BLKPTRSHIFT; dmu_buf_impl_t *parent_db = dbuf_hold_level(dn, 1, blkid >> epbs, FTAG); rw_exit(&dn->dn_struct_rwlock); if (parent_db == NULL) { kmem_free(dr, sizeof (*dr)); return (NULL); } int err = dbuf_read(parent_db, NULL, (DB_RF_NOPREFETCH | DB_RF_CANFAIL)); if (err != 0) { dbuf_rele(parent_db, FTAG); kmem_free(dr, sizeof (*dr)); return (NULL); } dbuf_dirty_record_t *parent_dr = dbuf_dirty(parent_db, tx); dbuf_rele(parent_db, FTAG); mutex_enter(&parent_dr->dt.di.dr_mtx); ASSERT3U(parent_dr->dr_txg, ==, tx->tx_txg); list_insert_tail(&parent_dr->dt.di.dr_children, dr); mutex_exit(&parent_dr->dt.di.dr_mtx); dr->dr_parent = parent_dr; } dmu_objset_willuse_space(dn->dn_objset, dr->dr_accounted, tx); return (dr); } dbuf_dirty_record_t * dbuf_dirty(dmu_buf_impl_t *db, dmu_tx_t *tx) { dnode_t *dn; objset_t *os; dbuf_dirty_record_t *dr, *dr_next, *dr_head; int txgoff = tx->tx_txg & TXG_MASK; boolean_t drop_struct_rwlock = B_FALSE; ASSERT(tx->tx_txg != 0); ASSERT(!zfs_refcount_is_zero(&db->db_holds)); DMU_TX_DIRTY_BUF(tx, db); DB_DNODE_ENTER(db); dn = DB_DNODE(db); /* * Shouldn't dirty a regular buffer in syncing context. Private * objects may be dirtied in syncing context, but only if they * were already pre-dirtied in open context. */ #ifdef ZFS_DEBUG if (dn->dn_objset->os_dsl_dataset != NULL) { rrw_enter(&dn->dn_objset->os_dsl_dataset->ds_bp_rwlock, RW_READER, FTAG); } ASSERT(!dmu_tx_is_syncing(tx) || BP_IS_HOLE(dn->dn_objset->os_rootbp) || DMU_OBJECT_IS_SPECIAL(dn->dn_object) || dn->dn_objset->os_dsl_dataset == NULL); if (dn->dn_objset->os_dsl_dataset != NULL) rrw_exit(&dn->dn_objset->os_dsl_dataset->ds_bp_rwlock, FTAG); #endif /* * We make this assert for private objects as well, but after we * check if we're already dirty. They are allowed to re-dirty * in syncing context. */ ASSERT(dn->dn_object == DMU_META_DNODE_OBJECT || dn->dn_dirtyctx == DN_UNDIRTIED || dn->dn_dirtyctx == (dmu_tx_is_syncing(tx) ? DN_DIRTY_SYNC : DN_DIRTY_OPEN)); mutex_enter(&db->db_mtx); /* * XXX make this true for indirects too? The problem is that * transactions created with dmu_tx_create_assigned() from * syncing context don't bother holding ahead. */ ASSERT(db->db_level != 0 || db->db_state == DB_CACHED || db->db_state == DB_FILL || db->db_state == DB_NOFILL); mutex_enter(&dn->dn_mtx); dnode_set_dirtyctx(dn, tx, db); if (tx->tx_txg > dn->dn_dirty_txg) dn->dn_dirty_txg = tx->tx_txg; mutex_exit(&dn->dn_mtx); if (db->db_blkid == DMU_SPILL_BLKID) dn->dn_have_spill = B_TRUE; /* * If this buffer is already dirty, we're done. */ dr_head = list_head(&db->db_dirty_records); ASSERT(dr_head == NULL || dr_head->dr_txg <= tx->tx_txg || db->db.db_object == DMU_META_DNODE_OBJECT); dr_next = dbuf_find_dirty_lte(db, tx->tx_txg); if (dr_next && dr_next->dr_txg == tx->tx_txg) { DB_DNODE_EXIT(db); dbuf_redirty(dr_next); mutex_exit(&db->db_mtx); return (dr_next); } /* * Only valid if not already dirty. */ ASSERT(dn->dn_object == 0 || dn->dn_dirtyctx == DN_UNDIRTIED || dn->dn_dirtyctx == (dmu_tx_is_syncing(tx) ? DN_DIRTY_SYNC : DN_DIRTY_OPEN)); ASSERT3U(dn->dn_nlevels, >, db->db_level); /* * We should only be dirtying in syncing context if it's the * mos or we're initializing the os or it's a special object. * However, we are allowed to dirty in syncing context provided * we already dirtied it in open context. Hence we must make * this assertion only if we're not already dirty. */ os = dn->dn_objset; VERIFY3U(tx->tx_txg, <=, spa_final_dirty_txg(os->os_spa)); #ifdef ZFS_DEBUG if (dn->dn_objset->os_dsl_dataset != NULL) rrw_enter(&os->os_dsl_dataset->ds_bp_rwlock, RW_READER, FTAG); ASSERT(!dmu_tx_is_syncing(tx) || DMU_OBJECT_IS_SPECIAL(dn->dn_object) || os->os_dsl_dataset == NULL || BP_IS_HOLE(os->os_rootbp)); if (dn->dn_objset->os_dsl_dataset != NULL) rrw_exit(&os->os_dsl_dataset->ds_bp_rwlock, FTAG); #endif ASSERT(db->db.db_size != 0); dprintf_dbuf(db, "size=%llx\n", (u_longlong_t)db->db.db_size); if (db->db_blkid != DMU_BONUS_BLKID) { dmu_objset_willuse_space(os, db->db.db_size, tx); } /* * If this buffer is dirty in an old transaction group we need * to make a copy of it so that the changes we make in this * transaction group won't leak out when we sync the older txg. */ dr = kmem_zalloc(sizeof (dbuf_dirty_record_t), KM_SLEEP); list_link_init(&dr->dr_dirty_node); list_link_init(&dr->dr_dbuf_node); dr->dr_dnode = dn; if (db->db_level == 0) { void *data_old = db->db_buf; if (db->db_state != DB_NOFILL) { if (db->db_blkid == DMU_BONUS_BLKID) { dbuf_fix_old_data(db, tx->tx_txg); data_old = db->db.db_data; } else if (db->db.db_object != DMU_META_DNODE_OBJECT) { /* * Release the data buffer from the cache so * that we can modify it without impacting * possible other users of this cached data * block. Note that indirect blocks and * private objects are not released until the * syncing state (since they are only modified * then). */ arc_release(db->db_buf, db); dbuf_fix_old_data(db, tx->tx_txg); data_old = db->db_buf; } ASSERT(data_old != NULL); } dr->dt.dl.dr_data = data_old; } else { mutex_init(&dr->dt.di.dr_mtx, NULL, MUTEX_NOLOCKDEP, NULL); list_create(&dr->dt.di.dr_children, sizeof (dbuf_dirty_record_t), offsetof(dbuf_dirty_record_t, dr_dirty_node)); } if (db->db_blkid != DMU_BONUS_BLKID) dr->dr_accounted = db->db.db_size; dr->dr_dbuf = db; dr->dr_txg = tx->tx_txg; list_insert_before(&db->db_dirty_records, dr_next, dr); /* * We could have been freed_in_flight between the dbuf_noread * and dbuf_dirty. We win, as though the dbuf_noread() had * happened after the free. */ if (db->db_level == 0 && db->db_blkid != DMU_BONUS_BLKID && db->db_blkid != DMU_SPILL_BLKID) { mutex_enter(&dn->dn_mtx); if (dn->dn_free_ranges[txgoff] != NULL) { range_tree_clear(dn->dn_free_ranges[txgoff], db->db_blkid, 1); } mutex_exit(&dn->dn_mtx); db->db_freed_in_flight = FALSE; } /* * This buffer is now part of this txg */ dbuf_add_ref(db, (void *)(uintptr_t)tx->tx_txg); db->db_dirtycnt += 1; ASSERT3U(db->db_dirtycnt, <=, 3); mutex_exit(&db->db_mtx); if (db->db_blkid == DMU_BONUS_BLKID || db->db_blkid == DMU_SPILL_BLKID) { mutex_enter(&dn->dn_mtx); ASSERT(!list_link_active(&dr->dr_dirty_node)); list_insert_tail(&dn->dn_dirty_records[txgoff], dr); mutex_exit(&dn->dn_mtx); dnode_setdirty(dn, tx); DB_DNODE_EXIT(db); return (dr); } if (!RW_WRITE_HELD(&dn->dn_struct_rwlock)) { rw_enter(&dn->dn_struct_rwlock, RW_READER); drop_struct_rwlock = B_TRUE; } /* * If we are overwriting a dedup BP, then unless it is snapshotted, * when we get to syncing context we will need to decrement its * refcount in the DDT. Prefetch the relevant DDT block so that * syncing context won't have to wait for the i/o. */ if (db->db_blkptr != NULL) { db_lock_type_t dblt = dmu_buf_lock_parent(db, RW_READER, FTAG); ddt_prefetch(os->os_spa, db->db_blkptr); dmu_buf_unlock_parent(db, dblt, FTAG); } /* * We need to hold the dn_struct_rwlock to make this assertion, * because it protects dn_phys / dn_next_nlevels from changing. */ ASSERT((dn->dn_phys->dn_nlevels == 0 && db->db_level == 0) || dn->dn_phys->dn_nlevels > db->db_level || dn->dn_next_nlevels[txgoff] > db->db_level || dn->dn_next_nlevels[(tx->tx_txg-1) & TXG_MASK] > db->db_level || dn->dn_next_nlevels[(tx->tx_txg-2) & TXG_MASK] > db->db_level); if (db->db_level == 0) { ASSERT(!db->db_objset->os_raw_receive || dn->dn_maxblkid >= db->db_blkid); dnode_new_blkid(dn, db->db_blkid, tx, drop_struct_rwlock, B_FALSE); ASSERT(dn->dn_maxblkid >= db->db_blkid); } if (db->db_level+1 < dn->dn_nlevels) { dmu_buf_impl_t *parent = db->db_parent; dbuf_dirty_record_t *di; int parent_held = FALSE; if (db->db_parent == NULL || db->db_parent == dn->dn_dbuf) { int epbs = dn->dn_indblkshift - SPA_BLKPTRSHIFT; parent = dbuf_hold_level(dn, db->db_level + 1, db->db_blkid >> epbs, FTAG); ASSERT(parent != NULL); parent_held = TRUE; } if (drop_struct_rwlock) rw_exit(&dn->dn_struct_rwlock); ASSERT3U(db->db_level + 1, ==, parent->db_level); di = dbuf_dirty(parent, tx); if (parent_held) dbuf_rele(parent, FTAG); mutex_enter(&db->db_mtx); /* * Since we've dropped the mutex, it's possible that * dbuf_undirty() might have changed this out from under us. */ if (list_head(&db->db_dirty_records) == dr || dn->dn_object == DMU_META_DNODE_OBJECT) { mutex_enter(&di->dt.di.dr_mtx); ASSERT3U(di->dr_txg, ==, tx->tx_txg); ASSERT(!list_link_active(&dr->dr_dirty_node)); list_insert_tail(&di->dt.di.dr_children, dr); mutex_exit(&di->dt.di.dr_mtx); dr->dr_parent = di; } mutex_exit(&db->db_mtx); } else { ASSERT(db->db_level + 1 == dn->dn_nlevels); ASSERT(db->db_blkid < dn->dn_nblkptr); ASSERT(db->db_parent == NULL || db->db_parent == dn->dn_dbuf); mutex_enter(&dn->dn_mtx); ASSERT(!list_link_active(&dr->dr_dirty_node)); list_insert_tail(&dn->dn_dirty_records[txgoff], dr); mutex_exit(&dn->dn_mtx); if (drop_struct_rwlock) rw_exit(&dn->dn_struct_rwlock); } dnode_setdirty(dn, tx); DB_DNODE_EXIT(db); return (dr); } static void dbuf_undirty_bonus(dbuf_dirty_record_t *dr) { dmu_buf_impl_t *db = dr->dr_dbuf; if (dr->dt.dl.dr_data != db->db.db_data) { struct dnode *dn = dr->dr_dnode; int max_bonuslen = DN_SLOTS_TO_BONUSLEN(dn->dn_num_slots); kmem_free(dr->dt.dl.dr_data, max_bonuslen); arc_space_return(max_bonuslen, ARC_SPACE_BONUS); } db->db_data_pending = NULL; ASSERT(list_next(&db->db_dirty_records, dr) == NULL); list_remove(&db->db_dirty_records, dr); if (dr->dr_dbuf->db_level != 0) { mutex_destroy(&dr->dt.di.dr_mtx); list_destroy(&dr->dt.di.dr_children); } kmem_free(dr, sizeof (dbuf_dirty_record_t)); ASSERT3U(db->db_dirtycnt, >, 0); db->db_dirtycnt -= 1; } /* * Undirty a buffer in the transaction group referenced by the given * transaction. Return whether this evicted the dbuf. */ static boolean_t dbuf_undirty(dmu_buf_impl_t *db, dmu_tx_t *tx) { uint64_t txg = tx->tx_txg; ASSERT(txg != 0); /* * Due to our use of dn_nlevels below, this can only be called * in open context, unless we are operating on the MOS. * From syncing context, dn_nlevels may be different from the * dn_nlevels used when dbuf was dirtied. */ ASSERT(db->db_objset == dmu_objset_pool(db->db_objset)->dp_meta_objset || txg != spa_syncing_txg(dmu_objset_spa(db->db_objset))); ASSERT(db->db_blkid != DMU_BONUS_BLKID); ASSERT0(db->db_level); ASSERT(MUTEX_HELD(&db->db_mtx)); /* * If this buffer is not dirty, we're done. */ dbuf_dirty_record_t *dr = dbuf_find_dirty_eq(db, txg); if (dr == NULL) return (B_FALSE); ASSERT(dr->dr_dbuf == db); dnode_t *dn = dr->dr_dnode; dprintf_dbuf(db, "size=%llx\n", (u_longlong_t)db->db.db_size); ASSERT(db->db.db_size != 0); dsl_pool_undirty_space(dmu_objset_pool(dn->dn_objset), dr->dr_accounted, txg); list_remove(&db->db_dirty_records, dr); /* * Note that there are three places in dbuf_dirty() * where this dirty record may be put on a list. * Make sure to do a list_remove corresponding to * every one of those list_insert calls. */ if (dr->dr_parent) { mutex_enter(&dr->dr_parent->dt.di.dr_mtx); list_remove(&dr->dr_parent->dt.di.dr_children, dr); mutex_exit(&dr->dr_parent->dt.di.dr_mtx); } else if (db->db_blkid == DMU_SPILL_BLKID || db->db_level + 1 == dn->dn_nlevels) { ASSERT(db->db_blkptr == NULL || db->db_parent == dn->dn_dbuf); mutex_enter(&dn->dn_mtx); list_remove(&dn->dn_dirty_records[txg & TXG_MASK], dr); mutex_exit(&dn->dn_mtx); } if (db->db_state != DB_NOFILL) { dbuf_unoverride(dr); ASSERT(db->db_buf != NULL); ASSERT(dr->dt.dl.dr_data != NULL); if (dr->dt.dl.dr_data != db->db_buf) arc_buf_destroy(dr->dt.dl.dr_data, db); } kmem_free(dr, sizeof (dbuf_dirty_record_t)); ASSERT(db->db_dirtycnt > 0); db->db_dirtycnt -= 1; if (zfs_refcount_remove(&db->db_holds, (void *)(uintptr_t)txg) == 0) { ASSERT(db->db_state == DB_NOFILL || arc_released(db->db_buf)); dbuf_destroy(db); return (B_TRUE); } return (B_FALSE); } static void dmu_buf_will_dirty_impl(dmu_buf_t *db_fake, int flags, dmu_tx_t *tx) { dmu_buf_impl_t *db = (dmu_buf_impl_t *)db_fake; ASSERT(tx->tx_txg != 0); ASSERT(!zfs_refcount_is_zero(&db->db_holds)); /* * Quick check for dirtiness. For already dirty blocks, this * reduces runtime of this function by >90%, and overall performance * by 50% for some workloads (e.g. file deletion with indirect blocks * cached). */ mutex_enter(&db->db_mtx); if (db->db_state == DB_CACHED) { dbuf_dirty_record_t *dr = dbuf_find_dirty_eq(db, tx->tx_txg); /* * It's possible that it is already dirty but not cached, * because there are some calls to dbuf_dirty() that don't * go through dmu_buf_will_dirty(). */ if (dr != NULL) { /* This dbuf is already dirty and cached. */ dbuf_redirty(dr); mutex_exit(&db->db_mtx); return; } } mutex_exit(&db->db_mtx); DB_DNODE_ENTER(db); if (RW_WRITE_HELD(&DB_DNODE(db)->dn_struct_rwlock)) flags |= DB_RF_HAVESTRUCT; DB_DNODE_EXIT(db); (void) dbuf_read(db, NULL, flags); (void) dbuf_dirty(db, tx); } void dmu_buf_will_dirty(dmu_buf_t *db_fake, dmu_tx_t *tx) { dmu_buf_will_dirty_impl(db_fake, DB_RF_MUST_SUCCEED | DB_RF_NOPREFETCH, tx); } boolean_t dmu_buf_is_dirty(dmu_buf_t *db_fake, dmu_tx_t *tx) { dmu_buf_impl_t *db = (dmu_buf_impl_t *)db_fake; dbuf_dirty_record_t *dr; mutex_enter(&db->db_mtx); dr = dbuf_find_dirty_eq(db, tx->tx_txg); mutex_exit(&db->db_mtx); return (dr != NULL); } void dmu_buf_will_not_fill(dmu_buf_t *db_fake, dmu_tx_t *tx) { dmu_buf_impl_t *db = (dmu_buf_impl_t *)db_fake; db->db_state = DB_NOFILL; DTRACE_SET_STATE(db, "allocating NOFILL buffer"); dmu_buf_will_fill(db_fake, tx); } void dmu_buf_will_fill(dmu_buf_t *db_fake, dmu_tx_t *tx) { dmu_buf_impl_t *db = (dmu_buf_impl_t *)db_fake; ASSERT(db->db_blkid != DMU_BONUS_BLKID); ASSERT(tx->tx_txg != 0); ASSERT(db->db_level == 0); ASSERT(!zfs_refcount_is_zero(&db->db_holds)); ASSERT(db->db.db_object != DMU_META_DNODE_OBJECT || dmu_tx_private_ok(tx)); dbuf_noread(db); (void) dbuf_dirty(db, tx); } /* * This function is effectively the same as dmu_buf_will_dirty(), but * indicates the caller expects raw encrypted data in the db, and provides * the crypt params (byteorder, salt, iv, mac) which should be stored in the * blkptr_t when this dbuf is written. This is only used for blocks of * dnodes, during raw receive. */ void dmu_buf_set_crypt_params(dmu_buf_t *db_fake, boolean_t byteorder, const uint8_t *salt, const uint8_t *iv, const uint8_t *mac, dmu_tx_t *tx) { dmu_buf_impl_t *db = (dmu_buf_impl_t *)db_fake; dbuf_dirty_record_t *dr; /* * dr_has_raw_params is only processed for blocks of dnodes * (see dbuf_sync_dnode_leaf_crypt()). */ ASSERT3U(db->db.db_object, ==, DMU_META_DNODE_OBJECT); ASSERT3U(db->db_level, ==, 0); ASSERT(db->db_objset->os_raw_receive); dmu_buf_will_dirty_impl(db_fake, DB_RF_MUST_SUCCEED | DB_RF_NOPREFETCH | DB_RF_NO_DECRYPT, tx); dr = dbuf_find_dirty_eq(db, tx->tx_txg); ASSERT3P(dr, !=, NULL); dr->dt.dl.dr_has_raw_params = B_TRUE; dr->dt.dl.dr_byteorder = byteorder; memcpy(dr->dt.dl.dr_salt, salt, ZIO_DATA_SALT_LEN); memcpy(dr->dt.dl.dr_iv, iv, ZIO_DATA_IV_LEN); memcpy(dr->dt.dl.dr_mac, mac, ZIO_DATA_MAC_LEN); } static void dbuf_override_impl(dmu_buf_impl_t *db, const blkptr_t *bp, dmu_tx_t *tx) { struct dirty_leaf *dl; dbuf_dirty_record_t *dr; dr = list_head(&db->db_dirty_records); ASSERT3P(dr, !=, NULL); ASSERT3U(dr->dr_txg, ==, tx->tx_txg); dl = &dr->dt.dl; dl->dr_overridden_by = *bp; dl->dr_override_state = DR_OVERRIDDEN; dl->dr_overridden_by.blk_birth = dr->dr_txg; } void dmu_buf_fill_done(dmu_buf_t *dbuf, dmu_tx_t *tx) { (void) tx; dmu_buf_impl_t *db = (dmu_buf_impl_t *)dbuf; dbuf_states_t old_state; mutex_enter(&db->db_mtx); DBUF_VERIFY(db); old_state = db->db_state; db->db_state = DB_CACHED; if (old_state == DB_FILL) { if (db->db_level == 0 && db->db_freed_in_flight) { ASSERT(db->db_blkid != DMU_BONUS_BLKID); /* we were freed while filling */ /* XXX dbuf_undirty? */ memset(db->db.db_data, 0, db->db.db_size); db->db_freed_in_flight = FALSE; DTRACE_SET_STATE(db, "fill done handling freed in flight"); } else { DTRACE_SET_STATE(db, "fill done"); } cv_broadcast(&db->db_changed); } mutex_exit(&db->db_mtx); } void dmu_buf_write_embedded(dmu_buf_t *dbuf, void *data, bp_embedded_type_t etype, enum zio_compress comp, int uncompressed_size, int compressed_size, int byteorder, dmu_tx_t *tx) { dmu_buf_impl_t *db = (dmu_buf_impl_t *)dbuf; struct dirty_leaf *dl; dmu_object_type_t type; dbuf_dirty_record_t *dr; if (etype == BP_EMBEDDED_TYPE_DATA) { ASSERT(spa_feature_is_active(dmu_objset_spa(db->db_objset), SPA_FEATURE_EMBEDDED_DATA)); } DB_DNODE_ENTER(db); type = DB_DNODE(db)->dn_type; DB_DNODE_EXIT(db); ASSERT0(db->db_level); ASSERT(db->db_blkid != DMU_BONUS_BLKID); dmu_buf_will_not_fill(dbuf, tx); dr = list_head(&db->db_dirty_records); ASSERT3P(dr, !=, NULL); ASSERT3U(dr->dr_txg, ==, tx->tx_txg); dl = &dr->dt.dl; encode_embedded_bp_compressed(&dl->dr_overridden_by, data, comp, uncompressed_size, compressed_size); BPE_SET_ETYPE(&dl->dr_overridden_by, etype); BP_SET_TYPE(&dl->dr_overridden_by, type); BP_SET_LEVEL(&dl->dr_overridden_by, 0); BP_SET_BYTEORDER(&dl->dr_overridden_by, byteorder); dl->dr_override_state = DR_OVERRIDDEN; dl->dr_overridden_by.blk_birth = dr->dr_txg; } void dmu_buf_redact(dmu_buf_t *dbuf, dmu_tx_t *tx) { dmu_buf_impl_t *db = (dmu_buf_impl_t *)dbuf; dmu_object_type_t type; ASSERT(dsl_dataset_feature_is_active(db->db_objset->os_dsl_dataset, SPA_FEATURE_REDACTED_DATASETS)); DB_DNODE_ENTER(db); type = DB_DNODE(db)->dn_type; DB_DNODE_EXIT(db); ASSERT0(db->db_level); dmu_buf_will_not_fill(dbuf, tx); blkptr_t bp = { { { {0} } } }; BP_SET_TYPE(&bp, type); BP_SET_LEVEL(&bp, 0); BP_SET_BIRTH(&bp, tx->tx_txg, 0); BP_SET_REDACTED(&bp); BPE_SET_LSIZE(&bp, dbuf->db_size); dbuf_override_impl(db, &bp, tx); } /* * Directly assign a provided arc buf to a given dbuf if it's not referenced * by anybody except our caller. Otherwise copy arcbuf's contents to dbuf. */ void dbuf_assign_arcbuf(dmu_buf_impl_t *db, arc_buf_t *buf, dmu_tx_t *tx) { ASSERT(!zfs_refcount_is_zero(&db->db_holds)); ASSERT(db->db_blkid != DMU_BONUS_BLKID); ASSERT(db->db_level == 0); ASSERT3U(dbuf_is_metadata(db), ==, arc_is_metadata(buf)); ASSERT(buf != NULL); ASSERT3U(arc_buf_lsize(buf), ==, db->db.db_size); ASSERT(tx->tx_txg != 0); arc_return_buf(buf, db); ASSERT(arc_released(buf)); mutex_enter(&db->db_mtx); while (db->db_state == DB_READ || db->db_state == DB_FILL) cv_wait(&db->db_changed, &db->db_mtx); ASSERT(db->db_state == DB_CACHED || db->db_state == DB_UNCACHED); if (db->db_state == DB_CACHED && zfs_refcount_count(&db->db_holds) - 1 > db->db_dirtycnt) { /* * In practice, we will never have a case where we have an * encrypted arc buffer while additional holds exist on the * dbuf. We don't handle this here so we simply assert that * fact instead. */ ASSERT(!arc_is_encrypted(buf)); mutex_exit(&db->db_mtx); (void) dbuf_dirty(db, tx); memcpy(db->db.db_data, buf->b_data, db->db.db_size); arc_buf_destroy(buf, db); return; } if (db->db_state == DB_CACHED) { dbuf_dirty_record_t *dr = list_head(&db->db_dirty_records); ASSERT(db->db_buf != NULL); if (dr != NULL && dr->dr_txg == tx->tx_txg) { ASSERT(dr->dt.dl.dr_data == db->db_buf); if (!arc_released(db->db_buf)) { ASSERT(dr->dt.dl.dr_override_state == DR_OVERRIDDEN); arc_release(db->db_buf, db); } dr->dt.dl.dr_data = buf; arc_buf_destroy(db->db_buf, db); } else if (dr == NULL || dr->dt.dl.dr_data != db->db_buf) { arc_release(db->db_buf, db); arc_buf_destroy(db->db_buf, db); } db->db_buf = NULL; } ASSERT(db->db_buf == NULL); dbuf_set_data(db, buf); db->db_state = DB_FILL; DTRACE_SET_STATE(db, "filling assigned arcbuf"); mutex_exit(&db->db_mtx); (void) dbuf_dirty(db, tx); dmu_buf_fill_done(&db->db, tx); } void dbuf_destroy(dmu_buf_impl_t *db) { dnode_t *dn; dmu_buf_impl_t *parent = db->db_parent; dmu_buf_impl_t *dndb; ASSERT(MUTEX_HELD(&db->db_mtx)); ASSERT(zfs_refcount_is_zero(&db->db_holds)); if (db->db_buf != NULL) { arc_buf_destroy(db->db_buf, db); db->db_buf = NULL; } if (db->db_blkid == DMU_BONUS_BLKID) { int slots = DB_DNODE(db)->dn_num_slots; int bonuslen = DN_SLOTS_TO_BONUSLEN(slots); if (db->db.db_data != NULL) { kmem_free(db->db.db_data, bonuslen); arc_space_return(bonuslen, ARC_SPACE_BONUS); db->db_state = DB_UNCACHED; DTRACE_SET_STATE(db, "buffer cleared"); } } dbuf_clear_data(db); if (multilist_link_active(&db->db_cache_link)) { ASSERT(db->db_caching_status == DB_DBUF_CACHE || db->db_caching_status == DB_DBUF_METADATA_CACHE); multilist_remove(&dbuf_caches[db->db_caching_status].cache, db); (void) zfs_refcount_remove_many( &dbuf_caches[db->db_caching_status].size, db->db.db_size, db); if (db->db_caching_status == DB_DBUF_METADATA_CACHE) { DBUF_STAT_BUMPDOWN(metadata_cache_count); } else { DBUF_STAT_BUMPDOWN(cache_levels[db->db_level]); DBUF_STAT_BUMPDOWN(cache_count); DBUF_STAT_DECR(cache_levels_bytes[db->db_level], db->db.db_size); } db->db_caching_status = DB_NO_CACHE; } ASSERT(db->db_state == DB_UNCACHED || db->db_state == DB_NOFILL); ASSERT(db->db_data_pending == NULL); ASSERT(list_is_empty(&db->db_dirty_records)); db->db_state = DB_EVICTING; DTRACE_SET_STATE(db, "buffer eviction started"); db->db_blkptr = NULL; /* * Now that db_state is DB_EVICTING, nobody else can find this via * the hash table. We can now drop db_mtx, which allows us to * acquire the dn_dbufs_mtx. */ mutex_exit(&db->db_mtx); DB_DNODE_ENTER(db); dn = DB_DNODE(db); dndb = dn->dn_dbuf; if (db->db_blkid != DMU_BONUS_BLKID) { boolean_t needlock = !MUTEX_HELD(&dn->dn_dbufs_mtx); if (needlock) mutex_enter_nested(&dn->dn_dbufs_mtx, NESTED_SINGLE); avl_remove(&dn->dn_dbufs, db); membar_producer(); DB_DNODE_EXIT(db); if (needlock) mutex_exit(&dn->dn_dbufs_mtx); /* * Decrementing the dbuf count means that the hold corresponding * to the removed dbuf is no longer discounted in dnode_move(), * so the dnode cannot be moved until after we release the hold. * The membar_producer() ensures visibility of the decremented * value in dnode_move(), since DB_DNODE_EXIT doesn't actually * release any lock. */ mutex_enter(&dn->dn_mtx); dnode_rele_and_unlock(dn, db, B_TRUE); db->db_dnode_handle = NULL; dbuf_hash_remove(db); } else { DB_DNODE_EXIT(db); } ASSERT(zfs_refcount_is_zero(&db->db_holds)); db->db_parent = NULL; ASSERT(db->db_buf == NULL); ASSERT(db->db.db_data == NULL); ASSERT(db->db_hash_next == NULL); ASSERT(db->db_blkptr == NULL); ASSERT(db->db_data_pending == NULL); ASSERT3U(db->db_caching_status, ==, DB_NO_CACHE); ASSERT(!multilist_link_active(&db->db_cache_link)); /* * If this dbuf is referenced from an indirect dbuf, * decrement the ref count on the indirect dbuf. */ if (parent && parent != dndb) { mutex_enter(&parent->db_mtx); dbuf_rele_and_unlock(parent, db, B_TRUE); } kmem_cache_free(dbuf_kmem_cache, db); arc_space_return(sizeof (dmu_buf_impl_t), ARC_SPACE_DBUF); } /* * Note: While bpp will always be updated if the function returns success, * parentp will not be updated if the dnode does not have dn_dbuf filled in; * this happens when the dnode is the meta-dnode, or {user|group|project}used * object. */ __attribute__((always_inline)) static inline int dbuf_findbp(dnode_t *dn, int level, uint64_t blkid, int fail_sparse, dmu_buf_impl_t **parentp, blkptr_t **bpp) { *parentp = NULL; *bpp = NULL; ASSERT(blkid != DMU_BONUS_BLKID); if (blkid == DMU_SPILL_BLKID) { mutex_enter(&dn->dn_mtx); if (dn->dn_have_spill && (dn->dn_phys->dn_flags & DNODE_FLAG_SPILL_BLKPTR)) *bpp = DN_SPILL_BLKPTR(dn->dn_phys); else *bpp = NULL; dbuf_add_ref(dn->dn_dbuf, NULL); *parentp = dn->dn_dbuf; mutex_exit(&dn->dn_mtx); return (0); } int nlevels = (dn->dn_phys->dn_nlevels == 0) ? 1 : dn->dn_phys->dn_nlevels; int epbs = dn->dn_indblkshift - SPA_BLKPTRSHIFT; ASSERT3U(level * epbs, <, 64); ASSERT(RW_LOCK_HELD(&dn->dn_struct_rwlock)); /* * This assertion shouldn't trip as long as the max indirect block size * is less than 1M. The reason for this is that up to that point, * the number of levels required to address an entire object with blocks * of size SPA_MINBLOCKSIZE satisfies nlevels * epbs + 1 <= 64. In * other words, if N * epbs + 1 > 64, then if (N-1) * epbs + 1 > 55 * (i.e. we can address the entire object), objects will all use at most * N-1 levels and the assertion won't overflow. However, once epbs is * 13, 4 * 13 + 1 = 53, but 5 * 13 + 1 = 66. Then, 4 levels will not be * enough to address an entire object, so objects will have 5 levels, * but then this assertion will overflow. * * All this is to say that if we ever increase DN_MAX_INDBLKSHIFT, we * need to redo this logic to handle overflows. */ ASSERT(level >= nlevels || ((nlevels - level - 1) * epbs) + highbit64(dn->dn_phys->dn_nblkptr) <= 64); if (level >= nlevels || blkid >= ((uint64_t)dn->dn_phys->dn_nblkptr << ((nlevels - level - 1) * epbs)) || (fail_sparse && blkid > (dn->dn_phys->dn_maxblkid >> (level * epbs)))) { /* the buffer has no parent yet */ return (SET_ERROR(ENOENT)); } else if (level < nlevels-1) { /* this block is referenced from an indirect block */ int err; err = dbuf_hold_impl(dn, level + 1, blkid >> epbs, fail_sparse, FALSE, NULL, parentp); if (err) return (err); err = dbuf_read(*parentp, NULL, (DB_RF_HAVESTRUCT | DB_RF_NOPREFETCH | DB_RF_CANFAIL)); if (err) { dbuf_rele(*parentp, NULL); *parentp = NULL; return (err); } rw_enter(&(*parentp)->db_rwlock, RW_READER); *bpp = ((blkptr_t *)(*parentp)->db.db_data) + (blkid & ((1ULL << epbs) - 1)); if (blkid > (dn->dn_phys->dn_maxblkid >> (level * epbs))) ASSERT(BP_IS_HOLE(*bpp)); rw_exit(&(*parentp)->db_rwlock); return (0); } else { /* the block is referenced from the dnode */ ASSERT3U(level, ==, nlevels-1); ASSERT(dn->dn_phys->dn_nblkptr == 0 || blkid < dn->dn_phys->dn_nblkptr); if (dn->dn_dbuf) { dbuf_add_ref(dn->dn_dbuf, NULL); *parentp = dn->dn_dbuf; } *bpp = &dn->dn_phys->dn_blkptr[blkid]; return (0); } } static dmu_buf_impl_t * dbuf_create(dnode_t *dn, uint8_t level, uint64_t blkid, dmu_buf_impl_t *parent, blkptr_t *blkptr) { objset_t *os = dn->dn_objset; dmu_buf_impl_t *db, *odb; ASSERT(RW_LOCK_HELD(&dn->dn_struct_rwlock)); ASSERT(dn->dn_type != DMU_OT_NONE); db = kmem_cache_alloc(dbuf_kmem_cache, KM_SLEEP); list_create(&db->db_dirty_records, sizeof (dbuf_dirty_record_t), offsetof(dbuf_dirty_record_t, dr_dbuf_node)); db->db_objset = os; db->db.db_object = dn->dn_object; db->db_level = level; db->db_blkid = blkid; db->db_dirtycnt = 0; db->db_dnode_handle = dn->dn_handle; db->db_parent = parent; db->db_blkptr = blkptr; db->db_user = NULL; db->db_user_immediate_evict = FALSE; db->db_freed_in_flight = FALSE; db->db_pending_evict = FALSE; if (blkid == DMU_BONUS_BLKID) { ASSERT3P(parent, ==, dn->dn_dbuf); db->db.db_size = DN_SLOTS_TO_BONUSLEN(dn->dn_num_slots) - (dn->dn_nblkptr-1) * sizeof (blkptr_t); ASSERT3U(db->db.db_size, >=, dn->dn_bonuslen); db->db.db_offset = DMU_BONUS_BLKID; db->db_state = DB_UNCACHED; DTRACE_SET_STATE(db, "bonus buffer created"); db->db_caching_status = DB_NO_CACHE; /* the bonus dbuf is not placed in the hash table */ arc_space_consume(sizeof (dmu_buf_impl_t), ARC_SPACE_DBUF); return (db); } else if (blkid == DMU_SPILL_BLKID) { db->db.db_size = (blkptr != NULL) ? BP_GET_LSIZE(blkptr) : SPA_MINBLOCKSIZE; db->db.db_offset = 0; } else { int blocksize = db->db_level ? 1 << dn->dn_indblkshift : dn->dn_datablksz; db->db.db_size = blocksize; db->db.db_offset = db->db_blkid * blocksize; } /* * Hold the dn_dbufs_mtx while we get the new dbuf * in the hash table *and* added to the dbufs list. * This prevents a possible deadlock with someone * trying to look up this dbuf before it's added to the * dn_dbufs list. */ mutex_enter(&dn->dn_dbufs_mtx); db->db_state = DB_EVICTING; /* not worth logging this state change */ if ((odb = dbuf_hash_insert(db)) != NULL) { /* someone else inserted it first */ mutex_exit(&dn->dn_dbufs_mtx); kmem_cache_free(dbuf_kmem_cache, db); DBUF_STAT_BUMP(hash_insert_race); return (odb); } avl_add(&dn->dn_dbufs, db); db->db_state = DB_UNCACHED; DTRACE_SET_STATE(db, "regular buffer created"); db->db_caching_status = DB_NO_CACHE; mutex_exit(&dn->dn_dbufs_mtx); arc_space_consume(sizeof (dmu_buf_impl_t), ARC_SPACE_DBUF); if (parent && parent != dn->dn_dbuf) dbuf_add_ref(parent, db); ASSERT(dn->dn_object == DMU_META_DNODE_OBJECT || zfs_refcount_count(&dn->dn_holds) > 0); (void) zfs_refcount_add(&dn->dn_holds, db); dprintf_dbuf(db, "db=%p\n", db); return (db); } /* * This function returns a block pointer and information about the object, * given a dnode and a block. This is a publicly accessible version of * dbuf_findbp that only returns some information, rather than the * dbuf. Note that the dnode passed in must be held, and the dn_struct_rwlock * should be locked as (at least) a reader. */ int dbuf_dnode_findbp(dnode_t *dn, uint64_t level, uint64_t blkid, blkptr_t *bp, uint16_t *datablkszsec, uint8_t *indblkshift) { dmu_buf_impl_t *dbp = NULL; blkptr_t *bp2; int err = 0; ASSERT(RW_LOCK_HELD(&dn->dn_struct_rwlock)); err = dbuf_findbp(dn, level, blkid, B_FALSE, &dbp, &bp2); if (err == 0) { *bp = *bp2; if (dbp != NULL) dbuf_rele(dbp, NULL); if (datablkszsec != NULL) *datablkszsec = dn->dn_phys->dn_datablkszsec; if (indblkshift != NULL) *indblkshift = dn->dn_phys->dn_indblkshift; } return (err); } typedef struct dbuf_prefetch_arg { spa_t *dpa_spa; /* The spa to issue the prefetch in. */ zbookmark_phys_t dpa_zb; /* The target block to prefetch. */ int dpa_epbs; /* Entries (blkptr_t's) Per Block Shift. */ int dpa_curlevel; /* The current level that we're reading */ dnode_t *dpa_dnode; /* The dnode associated with the prefetch */ zio_priority_t dpa_prio; /* The priority I/Os should be issued at. */ zio_t *dpa_zio; /* The parent zio_t for all prefetches. */ arc_flags_t dpa_aflags; /* Flags to pass to the final prefetch. */ dbuf_prefetch_fn dpa_cb; /* prefetch completion callback */ void *dpa_arg; /* prefetch completion arg */ } dbuf_prefetch_arg_t; static void dbuf_prefetch_fini(dbuf_prefetch_arg_t *dpa, boolean_t io_done) { if (dpa->dpa_cb != NULL) { dpa->dpa_cb(dpa->dpa_arg, dpa->dpa_zb.zb_level, dpa->dpa_zb.zb_blkid, io_done); } kmem_free(dpa, sizeof (*dpa)); } static void dbuf_issue_final_prefetch_done(zio_t *zio, const zbookmark_phys_t *zb, const blkptr_t *iobp, arc_buf_t *abuf, void *private) { (void) zio, (void) zb, (void) iobp; dbuf_prefetch_arg_t *dpa = private; if (abuf != NULL) arc_buf_destroy(abuf, private); dbuf_prefetch_fini(dpa, B_TRUE); } /* * Actually issue the prefetch read for the block given. */ static void dbuf_issue_final_prefetch(dbuf_prefetch_arg_t *dpa, blkptr_t *bp) { ASSERT(!BP_IS_REDACTED(bp) || dsl_dataset_feature_is_active( dpa->dpa_dnode->dn_objset->os_dsl_dataset, SPA_FEATURE_REDACTED_DATASETS)); if (BP_IS_HOLE(bp) || BP_IS_EMBEDDED(bp) || BP_IS_REDACTED(bp)) return (dbuf_prefetch_fini(dpa, B_FALSE)); int zio_flags = ZIO_FLAG_CANFAIL | ZIO_FLAG_SPECULATIVE; arc_flags_t aflags = dpa->dpa_aflags | ARC_FLAG_NOWAIT | ARC_FLAG_PREFETCH | ARC_FLAG_NO_BUF; /* dnodes are always read as raw and then converted later */ if (BP_GET_TYPE(bp) == DMU_OT_DNODE && BP_IS_PROTECTED(bp) && dpa->dpa_curlevel == 0) zio_flags |= ZIO_FLAG_RAW; ASSERT3U(dpa->dpa_curlevel, ==, BP_GET_LEVEL(bp)); ASSERT3U(dpa->dpa_curlevel, ==, dpa->dpa_zb.zb_level); ASSERT(dpa->dpa_zio != NULL); (void) arc_read(dpa->dpa_zio, dpa->dpa_spa, bp, dbuf_issue_final_prefetch_done, dpa, dpa->dpa_prio, zio_flags, &aflags, &dpa->dpa_zb); } /* * Called when an indirect block above our prefetch target is read in. This * will either read in the next indirect block down the tree or issue the actual * prefetch if the next block down is our target. */ static void dbuf_prefetch_indirect_done(zio_t *zio, const zbookmark_phys_t *zb, const blkptr_t *iobp, arc_buf_t *abuf, void *private) { (void) zb, (void) iobp; dbuf_prefetch_arg_t *dpa = private; ASSERT3S(dpa->dpa_zb.zb_level, <, dpa->dpa_curlevel); ASSERT3S(dpa->dpa_curlevel, >, 0); if (abuf == NULL) { ASSERT(zio == NULL || zio->io_error != 0); dbuf_prefetch_fini(dpa, B_TRUE); return; } ASSERT(zio == NULL || zio->io_error == 0); /* * The dpa_dnode is only valid if we are called with a NULL * zio. This indicates that the arc_read() returned without * first calling zio_read() to issue a physical read. Once * a physical read is made the dpa_dnode must be invalidated * as the locks guarding it may have been dropped. If the * dpa_dnode is still valid, then we want to add it to the dbuf * cache. To do so, we must hold the dbuf associated with the block * we just prefetched, read its contents so that we associate it * with an arc_buf_t, and then release it. */ if (zio != NULL) { ASSERT3S(BP_GET_LEVEL(zio->io_bp), ==, dpa->dpa_curlevel); if (zio->io_flags & ZIO_FLAG_RAW_COMPRESS) { ASSERT3U(BP_GET_PSIZE(zio->io_bp), ==, zio->io_size); } else { ASSERT3U(BP_GET_LSIZE(zio->io_bp), ==, zio->io_size); } ASSERT3P(zio->io_spa, ==, dpa->dpa_spa); dpa->dpa_dnode = NULL; } else if (dpa->dpa_dnode != NULL) { uint64_t curblkid = dpa->dpa_zb.zb_blkid >> (dpa->dpa_epbs * (dpa->dpa_curlevel - dpa->dpa_zb.zb_level)); dmu_buf_impl_t *db = dbuf_hold_level(dpa->dpa_dnode, dpa->dpa_curlevel, curblkid, FTAG); if (db == NULL) { arc_buf_destroy(abuf, private); dbuf_prefetch_fini(dpa, B_TRUE); return; } (void) dbuf_read(db, NULL, DB_RF_MUST_SUCCEED | DB_RF_NOPREFETCH | DB_RF_HAVESTRUCT); dbuf_rele(db, FTAG); } dpa->dpa_curlevel--; uint64_t nextblkid = dpa->dpa_zb.zb_blkid >> (dpa->dpa_epbs * (dpa->dpa_curlevel - dpa->dpa_zb.zb_level)); blkptr_t *bp = ((blkptr_t *)abuf->b_data) + P2PHASE(nextblkid, 1ULL << dpa->dpa_epbs); ASSERT(!BP_IS_REDACTED(bp) || dsl_dataset_feature_is_active( dpa->dpa_dnode->dn_objset->os_dsl_dataset, SPA_FEATURE_REDACTED_DATASETS)); if (BP_IS_HOLE(bp) || BP_IS_REDACTED(bp)) { arc_buf_destroy(abuf, private); dbuf_prefetch_fini(dpa, B_TRUE); return; } else if (dpa->dpa_curlevel == dpa->dpa_zb.zb_level) { ASSERT3U(nextblkid, ==, dpa->dpa_zb.zb_blkid); dbuf_issue_final_prefetch(dpa, bp); } else { arc_flags_t iter_aflags = ARC_FLAG_NOWAIT; zbookmark_phys_t zb; /* flag if L2ARC eligible, l2arc_noprefetch then decides */ if (dpa->dpa_aflags & ARC_FLAG_L2CACHE) iter_aflags |= ARC_FLAG_L2CACHE; ASSERT3U(dpa->dpa_curlevel, ==, BP_GET_LEVEL(bp)); SET_BOOKMARK(&zb, dpa->dpa_zb.zb_objset, dpa->dpa_zb.zb_object, dpa->dpa_curlevel, nextblkid); (void) arc_read(dpa->dpa_zio, dpa->dpa_spa, bp, dbuf_prefetch_indirect_done, dpa, ZIO_PRIORITY_SYNC_READ, ZIO_FLAG_CANFAIL | ZIO_FLAG_SPECULATIVE, &iter_aflags, &zb); } arc_buf_destroy(abuf, private); } /* * Issue prefetch reads for the given block on the given level. If the indirect * blocks above that block are not in memory, we will read them in * asynchronously. As a result, this call never blocks waiting for a read to * complete. Note that the prefetch might fail if the dataset is encrypted and * the encryption key is unmapped before the IO completes. */ int dbuf_prefetch_impl(dnode_t *dn, int64_t level, uint64_t blkid, zio_priority_t prio, arc_flags_t aflags, dbuf_prefetch_fn cb, void *arg) { blkptr_t bp; int epbs, nlevels, curlevel; uint64_t curblkid; ASSERT(blkid != DMU_BONUS_BLKID); ASSERT(RW_LOCK_HELD(&dn->dn_struct_rwlock)); if (blkid > dn->dn_maxblkid) goto no_issue; if (level == 0 && dnode_block_freed(dn, blkid)) goto no_issue; /* * This dnode hasn't been written to disk yet, so there's nothing to * prefetch. */ nlevels = dn->dn_phys->dn_nlevels; if (level >= nlevels || dn->dn_phys->dn_nblkptr == 0) goto no_issue; epbs = dn->dn_phys->dn_indblkshift - SPA_BLKPTRSHIFT; if (dn->dn_phys->dn_maxblkid < blkid << (epbs * level)) goto no_issue; dmu_buf_impl_t *db = dbuf_find(dn->dn_objset, dn->dn_object, level, blkid); if (db != NULL) { mutex_exit(&db->db_mtx); /* * This dbuf already exists. It is either CACHED, or * (we assume) about to be read or filled. */ goto no_issue; } /* * Find the closest ancestor (indirect block) of the target block * that is present in the cache. In this indirect block, we will * find the bp that is at curlevel, curblkid. */ curlevel = level; curblkid = blkid; while (curlevel < nlevels - 1) { int parent_level = curlevel + 1; uint64_t parent_blkid = curblkid >> epbs; dmu_buf_impl_t *db; if (dbuf_hold_impl(dn, parent_level, parent_blkid, FALSE, TRUE, FTAG, &db) == 0) { blkptr_t *bpp = db->db_buf->b_data; bp = bpp[P2PHASE(curblkid, 1 << epbs)]; dbuf_rele(db, FTAG); break; } curlevel = parent_level; curblkid = parent_blkid; } if (curlevel == nlevels - 1) { /* No cached indirect blocks found. */ ASSERT3U(curblkid, <, dn->dn_phys->dn_nblkptr); bp = dn->dn_phys->dn_blkptr[curblkid]; } ASSERT(!BP_IS_REDACTED(&bp) || dsl_dataset_feature_is_active(dn->dn_objset->os_dsl_dataset, SPA_FEATURE_REDACTED_DATASETS)); if (BP_IS_HOLE(&bp) || BP_IS_REDACTED(&bp)) goto no_issue; ASSERT3U(curlevel, ==, BP_GET_LEVEL(&bp)); zio_t *pio = zio_root(dmu_objset_spa(dn->dn_objset), NULL, NULL, ZIO_FLAG_CANFAIL); dbuf_prefetch_arg_t *dpa = kmem_zalloc(sizeof (*dpa), KM_SLEEP); dsl_dataset_t *ds = dn->dn_objset->os_dsl_dataset; SET_BOOKMARK(&dpa->dpa_zb, ds != NULL ? ds->ds_object : DMU_META_OBJSET, dn->dn_object, level, blkid); dpa->dpa_curlevel = curlevel; dpa->dpa_prio = prio; dpa->dpa_aflags = aflags; dpa->dpa_spa = dn->dn_objset->os_spa; dpa->dpa_dnode = dn; dpa->dpa_epbs = epbs; dpa->dpa_zio = pio; dpa->dpa_cb = cb; dpa->dpa_arg = arg; /* flag if L2ARC eligible, l2arc_noprefetch then decides */ if (dnode_level_is_l2cacheable(&bp, dn, level)) dpa->dpa_aflags |= ARC_FLAG_L2CACHE; /* * If we have the indirect just above us, no need to do the asynchronous * prefetch chain; we'll just run the last step ourselves. If we're at * a higher level, though, we want to issue the prefetches for all the * indirect blocks asynchronously, so we can go on with whatever we were * doing. */ if (curlevel == level) { ASSERT3U(curblkid, ==, blkid); dbuf_issue_final_prefetch(dpa, &bp); } else { arc_flags_t iter_aflags = ARC_FLAG_NOWAIT; zbookmark_phys_t zb; /* flag if L2ARC eligible, l2arc_noprefetch then decides */ if (dnode_level_is_l2cacheable(&bp, dn, level)) iter_aflags |= ARC_FLAG_L2CACHE; SET_BOOKMARK(&zb, ds != NULL ? ds->ds_object : DMU_META_OBJSET, dn->dn_object, curlevel, curblkid); (void) arc_read(dpa->dpa_zio, dpa->dpa_spa, &bp, dbuf_prefetch_indirect_done, dpa, ZIO_PRIORITY_SYNC_READ, ZIO_FLAG_CANFAIL | ZIO_FLAG_SPECULATIVE, &iter_aflags, &zb); } /* * We use pio here instead of dpa_zio since it's possible that * dpa may have already been freed. */ zio_nowait(pio); return (1); no_issue: if (cb != NULL) cb(arg, level, blkid, B_FALSE); return (0); } int dbuf_prefetch(dnode_t *dn, int64_t level, uint64_t blkid, zio_priority_t prio, arc_flags_t aflags) { return (dbuf_prefetch_impl(dn, level, blkid, prio, aflags, NULL, NULL)); } /* * Helper function for dbuf_hold_impl() to copy a buffer. Handles * the case of encrypted, compressed and uncompressed buffers by * allocating the new buffer, respectively, with arc_alloc_raw_buf(), * arc_alloc_compressed_buf() or arc_alloc_buf().* * * NOTE: Declared noinline to avoid stack bloat in dbuf_hold_impl(). */ noinline static void dbuf_hold_copy(dnode_t *dn, dmu_buf_impl_t *db) { dbuf_dirty_record_t *dr = db->db_data_pending; arc_buf_t *data = dr->dt.dl.dr_data; enum zio_compress compress_type = arc_get_compression(data); uint8_t complevel = arc_get_complevel(data); if (arc_is_encrypted(data)) { boolean_t byteorder; uint8_t salt[ZIO_DATA_SALT_LEN]; uint8_t iv[ZIO_DATA_IV_LEN]; uint8_t mac[ZIO_DATA_MAC_LEN]; arc_get_raw_params(data, &byteorder, salt, iv, mac); dbuf_set_data(db, arc_alloc_raw_buf(dn->dn_objset->os_spa, db, dmu_objset_id(dn->dn_objset), byteorder, salt, iv, mac, dn->dn_type, arc_buf_size(data), arc_buf_lsize(data), compress_type, complevel)); } else if (compress_type != ZIO_COMPRESS_OFF) { dbuf_set_data(db, arc_alloc_compressed_buf( dn->dn_objset->os_spa, db, arc_buf_size(data), arc_buf_lsize(data), compress_type, complevel)); } else { dbuf_set_data(db, arc_alloc_buf(dn->dn_objset->os_spa, db, DBUF_GET_BUFC_TYPE(db), db->db.db_size)); } rw_enter(&db->db_rwlock, RW_WRITER); memcpy(db->db.db_data, data->b_data, arc_buf_size(data)); rw_exit(&db->db_rwlock); } /* * Returns with db_holds incremented, and db_mtx not held. * Note: dn_struct_rwlock must be held. */ int dbuf_hold_impl(dnode_t *dn, uint8_t level, uint64_t blkid, boolean_t fail_sparse, boolean_t fail_uncached, const void *tag, dmu_buf_impl_t **dbp) { dmu_buf_impl_t *db, *parent = NULL; /* If the pool has been created, verify the tx_sync_lock is not held */ spa_t *spa = dn->dn_objset->os_spa; dsl_pool_t *dp = spa->spa_dsl_pool; if (dp != NULL) { ASSERT(!MUTEX_HELD(&dp->dp_tx.tx_sync_lock)); } ASSERT(blkid != DMU_BONUS_BLKID); ASSERT(RW_LOCK_HELD(&dn->dn_struct_rwlock)); ASSERT3U(dn->dn_nlevels, >, level); *dbp = NULL; /* dbuf_find() returns with db_mtx held */ db = dbuf_find(dn->dn_objset, dn->dn_object, level, blkid); if (db == NULL) { blkptr_t *bp = NULL; int err; if (fail_uncached) return (SET_ERROR(ENOENT)); ASSERT3P(parent, ==, NULL); err = dbuf_findbp(dn, level, blkid, fail_sparse, &parent, &bp); if (fail_sparse) { if (err == 0 && bp && BP_IS_HOLE(bp)) err = SET_ERROR(ENOENT); if (err) { if (parent) dbuf_rele(parent, NULL); return (err); } } if (err && err != ENOENT) return (err); db = dbuf_create(dn, level, blkid, parent, bp); } if (fail_uncached && db->db_state != DB_CACHED) { mutex_exit(&db->db_mtx); return (SET_ERROR(ENOENT)); } if (db->db_buf != NULL) { arc_buf_access(db->db_buf); ASSERT3P(db->db.db_data, ==, db->db_buf->b_data); } ASSERT(db->db_buf == NULL || arc_referenced(db->db_buf)); /* * If this buffer is currently syncing out, and we are * still referencing it from db_data, we need to make a copy * of it in case we decide we want to dirty it again in this txg. */ if (db->db_level == 0 && db->db_blkid != DMU_BONUS_BLKID && dn->dn_object != DMU_META_DNODE_OBJECT && db->db_state == DB_CACHED && db->db_data_pending) { dbuf_dirty_record_t *dr = db->db_data_pending; if (dr->dt.dl.dr_data == db->db_buf) dbuf_hold_copy(dn, db); } if (multilist_link_active(&db->db_cache_link)) { ASSERT(zfs_refcount_is_zero(&db->db_holds)); ASSERT(db->db_caching_status == DB_DBUF_CACHE || db->db_caching_status == DB_DBUF_METADATA_CACHE); multilist_remove(&dbuf_caches[db->db_caching_status].cache, db); (void) zfs_refcount_remove_many( &dbuf_caches[db->db_caching_status].size, db->db.db_size, db); if (db->db_caching_status == DB_DBUF_METADATA_CACHE) { DBUF_STAT_BUMPDOWN(metadata_cache_count); } else { DBUF_STAT_BUMPDOWN(cache_levels[db->db_level]); DBUF_STAT_BUMPDOWN(cache_count); DBUF_STAT_DECR(cache_levels_bytes[db->db_level], db->db.db_size); } db->db_caching_status = DB_NO_CACHE; } (void) zfs_refcount_add(&db->db_holds, tag); DBUF_VERIFY(db); mutex_exit(&db->db_mtx); /* NOTE: we can't rele the parent until after we drop the db_mtx */ if (parent) dbuf_rele(parent, NULL); ASSERT3P(DB_DNODE(db), ==, dn); ASSERT3U(db->db_blkid, ==, blkid); ASSERT3U(db->db_level, ==, level); *dbp = db; return (0); } dmu_buf_impl_t * dbuf_hold(dnode_t *dn, uint64_t blkid, const void *tag) { return (dbuf_hold_level(dn, 0, blkid, tag)); } dmu_buf_impl_t * dbuf_hold_level(dnode_t *dn, int level, uint64_t blkid, const void *tag) { dmu_buf_impl_t *db; int err = dbuf_hold_impl(dn, level, blkid, FALSE, FALSE, tag, &db); return (err ? NULL : db); } void dbuf_create_bonus(dnode_t *dn) { ASSERT(RW_WRITE_HELD(&dn->dn_struct_rwlock)); ASSERT(dn->dn_bonus == NULL); dn->dn_bonus = dbuf_create(dn, 0, DMU_BONUS_BLKID, dn->dn_dbuf, NULL); } int dbuf_spill_set_blksz(dmu_buf_t *db_fake, uint64_t blksz, dmu_tx_t *tx) { dmu_buf_impl_t *db = (dmu_buf_impl_t *)db_fake; if (db->db_blkid != DMU_SPILL_BLKID) return (SET_ERROR(ENOTSUP)); if (blksz == 0) blksz = SPA_MINBLOCKSIZE; ASSERT3U(blksz, <=, spa_maxblocksize(dmu_objset_spa(db->db_objset))); blksz = P2ROUNDUP(blksz, SPA_MINBLOCKSIZE); dbuf_new_size(db, blksz, tx); return (0); } void dbuf_rm_spill(dnode_t *dn, dmu_tx_t *tx) { dbuf_free_range(dn, DMU_SPILL_BLKID, DMU_SPILL_BLKID, tx); } #pragma weak dmu_buf_add_ref = dbuf_add_ref void dbuf_add_ref(dmu_buf_impl_t *db, const void *tag) { int64_t holds = zfs_refcount_add(&db->db_holds, tag); VERIFY3S(holds, >, 1); } #pragma weak dmu_buf_try_add_ref = dbuf_try_add_ref boolean_t dbuf_try_add_ref(dmu_buf_t *db_fake, objset_t *os, uint64_t obj, uint64_t blkid, const void *tag) { dmu_buf_impl_t *db = (dmu_buf_impl_t *)db_fake; dmu_buf_impl_t *found_db; boolean_t result = B_FALSE; if (blkid == DMU_BONUS_BLKID) found_db = dbuf_find_bonus(os, obj); else found_db = dbuf_find(os, obj, 0, blkid); if (found_db != NULL) { if (db == found_db && dbuf_refcount(db) > db->db_dirtycnt) { (void) zfs_refcount_add(&db->db_holds, tag); result = B_TRUE; } mutex_exit(&found_db->db_mtx); } return (result); } /* * If you call dbuf_rele() you had better not be referencing the dnode handle * unless you have some other direct or indirect hold on the dnode. (An indirect * hold is a hold on one of the dnode's dbufs, including the bonus buffer.) * Without that, the dbuf_rele() could lead to a dnode_rele() followed by the * dnode's parent dbuf evicting its dnode handles. */ void dbuf_rele(dmu_buf_impl_t *db, const void *tag) { mutex_enter(&db->db_mtx); dbuf_rele_and_unlock(db, tag, B_FALSE); } void dmu_buf_rele(dmu_buf_t *db, const void *tag) { dbuf_rele((dmu_buf_impl_t *)db, tag); } /* * dbuf_rele() for an already-locked dbuf. This is necessary to allow * db_dirtycnt and db_holds to be updated atomically. The 'evicting' * argument should be set if we are already in the dbuf-evicting code * path, in which case we don't want to recursively evict. This allows us to * avoid deeply nested stacks that would have a call flow similar to this: * * dbuf_rele()-->dbuf_rele_and_unlock()-->dbuf_evict_notify() * ^ | * | | * +-----dbuf_destroy()<--dbuf_evict_one()<--------+ * */ void dbuf_rele_and_unlock(dmu_buf_impl_t *db, const void *tag, boolean_t evicting) { int64_t holds; uint64_t size; ASSERT(MUTEX_HELD(&db->db_mtx)); DBUF_VERIFY(db); /* * Remove the reference to the dbuf before removing its hold on the * dnode so we can guarantee in dnode_move() that a referenced bonus * buffer has a corresponding dnode hold. */ holds = zfs_refcount_remove(&db->db_holds, tag); ASSERT(holds >= 0); /* * We can't freeze indirects if there is a possibility that they * may be modified in the current syncing context. */ if (db->db_buf != NULL && holds == (db->db_level == 0 ? db->db_dirtycnt : 0)) { arc_buf_freeze(db->db_buf); } if (holds == db->db_dirtycnt && db->db_level == 0 && db->db_user_immediate_evict) dbuf_evict_user(db); if (holds == 0) { if (db->db_blkid == DMU_BONUS_BLKID) { dnode_t *dn; boolean_t evict_dbuf = db->db_pending_evict; /* * If the dnode moves here, we cannot cross this * barrier until the move completes. */ DB_DNODE_ENTER(db); dn = DB_DNODE(db); atomic_dec_32(&dn->dn_dbufs_count); /* * Decrementing the dbuf count means that the bonus * buffer's dnode hold is no longer discounted in * dnode_move(). The dnode cannot move until after * the dnode_rele() below. */ DB_DNODE_EXIT(db); /* * Do not reference db after its lock is dropped. * Another thread may evict it. */ mutex_exit(&db->db_mtx); if (evict_dbuf) dnode_evict_bonus(dn); dnode_rele(dn, db); } else if (db->db_buf == NULL) { /* * This is a special case: we never associated this * dbuf with any data allocated from the ARC. */ ASSERT(db->db_state == DB_UNCACHED || db->db_state == DB_NOFILL); dbuf_destroy(db); } else if (arc_released(db->db_buf)) { /* * This dbuf has anonymous data associated with it. */ dbuf_destroy(db); } else { boolean_t do_arc_evict = B_FALSE; blkptr_t bp; spa_t *spa = dmu_objset_spa(db->db_objset); if (!DBUF_IS_CACHEABLE(db) && db->db_blkptr != NULL && !BP_IS_HOLE(db->db_blkptr) && !BP_IS_EMBEDDED(db->db_blkptr)) { do_arc_evict = B_TRUE; bp = *db->db_blkptr; } if (!DBUF_IS_CACHEABLE(db) || db->db_pending_evict) { dbuf_destroy(db); } else if (!multilist_link_active(&db->db_cache_link)) { ASSERT3U(db->db_caching_status, ==, DB_NO_CACHE); dbuf_cached_state_t dcs = dbuf_include_in_metadata_cache(db) ? DB_DBUF_METADATA_CACHE : DB_DBUF_CACHE; db->db_caching_status = dcs; multilist_insert(&dbuf_caches[dcs].cache, db); uint64_t db_size = db->db.db_size; size = zfs_refcount_add_many( &dbuf_caches[dcs].size, db_size, db); uint8_t db_level = db->db_level; mutex_exit(&db->db_mtx); if (dcs == DB_DBUF_METADATA_CACHE) { DBUF_STAT_BUMP(metadata_cache_count); DBUF_STAT_MAX( metadata_cache_size_bytes_max, size); } else { DBUF_STAT_BUMP(cache_count); DBUF_STAT_MAX(cache_size_bytes_max, size); DBUF_STAT_BUMP(cache_levels[db_level]); DBUF_STAT_INCR( cache_levels_bytes[db_level], db_size); } if (dcs == DB_DBUF_CACHE && !evicting) dbuf_evict_notify(size); } if (do_arc_evict) arc_freed(spa, &bp); } } else { mutex_exit(&db->db_mtx); } } #pragma weak dmu_buf_refcount = dbuf_refcount uint64_t dbuf_refcount(dmu_buf_impl_t *db) { return (zfs_refcount_count(&db->db_holds)); } uint64_t dmu_buf_user_refcount(dmu_buf_t *db_fake) { uint64_t holds; dmu_buf_impl_t *db = (dmu_buf_impl_t *)db_fake; mutex_enter(&db->db_mtx); ASSERT3U(zfs_refcount_count(&db->db_holds), >=, db->db_dirtycnt); holds = zfs_refcount_count(&db->db_holds) - db->db_dirtycnt; mutex_exit(&db->db_mtx); return (holds); } void * dmu_buf_replace_user(dmu_buf_t *db_fake, dmu_buf_user_t *old_user, dmu_buf_user_t *new_user) { dmu_buf_impl_t *db = (dmu_buf_impl_t *)db_fake; mutex_enter(&db->db_mtx); dbuf_verify_user(db, DBVU_NOT_EVICTING); if (db->db_user == old_user) db->db_user = new_user; else old_user = db->db_user; dbuf_verify_user(db, DBVU_NOT_EVICTING); mutex_exit(&db->db_mtx); return (old_user); } void * dmu_buf_set_user(dmu_buf_t *db_fake, dmu_buf_user_t *user) { return (dmu_buf_replace_user(db_fake, NULL, user)); } void * dmu_buf_set_user_ie(dmu_buf_t *db_fake, dmu_buf_user_t *user) { dmu_buf_impl_t *db = (dmu_buf_impl_t *)db_fake; db->db_user_immediate_evict = TRUE; return (dmu_buf_set_user(db_fake, user)); } void * dmu_buf_remove_user(dmu_buf_t *db_fake, dmu_buf_user_t *user) { return (dmu_buf_replace_user(db_fake, user, NULL)); } void * dmu_buf_get_user(dmu_buf_t *db_fake) { dmu_buf_impl_t *db = (dmu_buf_impl_t *)db_fake; dbuf_verify_user(db, DBVU_NOT_EVICTING); return (db->db_user); } void dmu_buf_user_evict_wait(void) { taskq_wait(dbu_evict_taskq); } blkptr_t * dmu_buf_get_blkptr(dmu_buf_t *db) { dmu_buf_impl_t *dbi = (dmu_buf_impl_t *)db; return (dbi->db_blkptr); } objset_t * dmu_buf_get_objset(dmu_buf_t *db) { dmu_buf_impl_t *dbi = (dmu_buf_impl_t *)db; return (dbi->db_objset); } dnode_t * dmu_buf_dnode_enter(dmu_buf_t *db) { dmu_buf_impl_t *dbi = (dmu_buf_impl_t *)db; DB_DNODE_ENTER(dbi); return (DB_DNODE(dbi)); } void dmu_buf_dnode_exit(dmu_buf_t *db) { dmu_buf_impl_t *dbi = (dmu_buf_impl_t *)db; DB_DNODE_EXIT(dbi); } static void dbuf_check_blkptr(dnode_t *dn, dmu_buf_impl_t *db) { /* ASSERT(dmu_tx_is_syncing(tx) */ ASSERT(MUTEX_HELD(&db->db_mtx)); if (db->db_blkptr != NULL) return; if (db->db_blkid == DMU_SPILL_BLKID) { db->db_blkptr = DN_SPILL_BLKPTR(dn->dn_phys); BP_ZERO(db->db_blkptr); return; } if (db->db_level == dn->dn_phys->dn_nlevels-1) { /* * This buffer was allocated at a time when there was * no available blkptrs from the dnode, or it was * inappropriate to hook it in (i.e., nlevels mismatch). */ ASSERT(db->db_blkid < dn->dn_phys->dn_nblkptr); ASSERT(db->db_parent == NULL); db->db_parent = dn->dn_dbuf; db->db_blkptr = &dn->dn_phys->dn_blkptr[db->db_blkid]; DBUF_VERIFY(db); } else { dmu_buf_impl_t *parent = db->db_parent; int epbs = dn->dn_phys->dn_indblkshift - SPA_BLKPTRSHIFT; ASSERT(dn->dn_phys->dn_nlevels > 1); if (parent == NULL) { mutex_exit(&db->db_mtx); rw_enter(&dn->dn_struct_rwlock, RW_READER); parent = dbuf_hold_level(dn, db->db_level + 1, db->db_blkid >> epbs, db); rw_exit(&dn->dn_struct_rwlock); mutex_enter(&db->db_mtx); db->db_parent = parent; } db->db_blkptr = (blkptr_t *)parent->db.db_data + (db->db_blkid & ((1ULL << epbs) - 1)); DBUF_VERIFY(db); } } static void dbuf_sync_bonus(dbuf_dirty_record_t *dr, dmu_tx_t *tx) { dmu_buf_impl_t *db = dr->dr_dbuf; void *data = dr->dt.dl.dr_data; ASSERT0(db->db_level); ASSERT(MUTEX_HELD(&db->db_mtx)); ASSERT(db->db_blkid == DMU_BONUS_BLKID); ASSERT(data != NULL); dnode_t *dn = dr->dr_dnode; ASSERT3U(DN_MAX_BONUS_LEN(dn->dn_phys), <=, DN_SLOTS_TO_BONUSLEN(dn->dn_phys->dn_extra_slots + 1)); memcpy(DN_BONUS(dn->dn_phys), data, DN_MAX_BONUS_LEN(dn->dn_phys)); dbuf_sync_leaf_verify_bonus_dnode(dr); dbuf_undirty_bonus(dr); dbuf_rele_and_unlock(db, (void *)(uintptr_t)tx->tx_txg, B_FALSE); } /* * When syncing out a blocks of dnodes, adjust the block to deal with * encryption. Normally, we make sure the block is decrypted before writing * it. If we have crypt params, then we are writing a raw (encrypted) block, * from a raw receive. In this case, set the ARC buf's crypt params so * that the BP will be filled with the correct byteorder, salt, iv, and mac. */ static void dbuf_prepare_encrypted_dnode_leaf(dbuf_dirty_record_t *dr) { int err; dmu_buf_impl_t *db = dr->dr_dbuf; ASSERT(MUTEX_HELD(&db->db_mtx)); ASSERT3U(db->db.db_object, ==, DMU_META_DNODE_OBJECT); ASSERT3U(db->db_level, ==, 0); if (!db->db_objset->os_raw_receive && arc_is_encrypted(db->db_buf)) { zbookmark_phys_t zb; /* * Unfortunately, there is currently no mechanism for * syncing context to handle decryption errors. An error * here is only possible if an attacker maliciously * changed a dnode block and updated the associated * checksums going up the block tree. */ SET_BOOKMARK(&zb, dmu_objset_id(db->db_objset), db->db.db_object, db->db_level, db->db_blkid); err = arc_untransform(db->db_buf, db->db_objset->os_spa, &zb, B_TRUE); if (err) panic("Invalid dnode block MAC"); } else if (dr->dt.dl.dr_has_raw_params) { (void) arc_release(dr->dt.dl.dr_data, db); arc_convert_to_raw(dr->dt.dl.dr_data, dmu_objset_id(db->db_objset), dr->dt.dl.dr_byteorder, DMU_OT_DNODE, dr->dt.dl.dr_salt, dr->dt.dl.dr_iv, dr->dt.dl.dr_mac); } } /* * dbuf_sync_indirect() is called recursively from dbuf_sync_list() so it * is critical the we not allow the compiler to inline this function in to * dbuf_sync_list() thereby drastically bloating the stack usage. */ noinline static void dbuf_sync_indirect(dbuf_dirty_record_t *dr, dmu_tx_t *tx) { dmu_buf_impl_t *db = dr->dr_dbuf; dnode_t *dn = dr->dr_dnode; ASSERT(dmu_tx_is_syncing(tx)); dprintf_dbuf_bp(db, db->db_blkptr, "blkptr=%p", db->db_blkptr); mutex_enter(&db->db_mtx); ASSERT(db->db_level > 0); DBUF_VERIFY(db); /* Read the block if it hasn't been read yet. */ if (db->db_buf == NULL) { mutex_exit(&db->db_mtx); (void) dbuf_read(db, NULL, DB_RF_MUST_SUCCEED); mutex_enter(&db->db_mtx); } ASSERT3U(db->db_state, ==, DB_CACHED); ASSERT(db->db_buf != NULL); /* Indirect block size must match what the dnode thinks it is. */ ASSERT3U(db->db.db_size, ==, 1<dn_phys->dn_indblkshift); dbuf_check_blkptr(dn, db); /* Provide the pending dirty record to child dbufs */ db->db_data_pending = dr; mutex_exit(&db->db_mtx); dbuf_write(dr, db->db_buf, tx); zio_t *zio = dr->dr_zio; mutex_enter(&dr->dt.di.dr_mtx); dbuf_sync_list(&dr->dt.di.dr_children, db->db_level - 1, tx); ASSERT(list_head(&dr->dt.di.dr_children) == NULL); mutex_exit(&dr->dt.di.dr_mtx); zio_nowait(zio); } /* * Verify that the size of the data in our bonus buffer does not exceed * its recorded size. * * The purpose of this verification is to catch any cases in development * where the size of a phys structure (i.e space_map_phys_t) grows and, * due to incorrect feature management, older pools expect to read more * data even though they didn't actually write it to begin with. * * For a example, this would catch an error in the feature logic where we * open an older pool and we expect to write the space map histogram of * a space map with size SPACE_MAP_SIZE_V0. */ static void dbuf_sync_leaf_verify_bonus_dnode(dbuf_dirty_record_t *dr) { #ifdef ZFS_DEBUG dnode_t *dn = dr->dr_dnode; /* * Encrypted bonus buffers can have data past their bonuslen. * Skip the verification of these blocks. */ if (DMU_OT_IS_ENCRYPTED(dn->dn_bonustype)) return; uint16_t bonuslen = dn->dn_phys->dn_bonuslen; uint16_t maxbonuslen = DN_SLOTS_TO_BONUSLEN(dn->dn_num_slots); ASSERT3U(bonuslen, <=, maxbonuslen); arc_buf_t *datap = dr->dt.dl.dr_data; char *datap_end = ((char *)datap) + bonuslen; char *datap_max = ((char *)datap) + maxbonuslen; /* ensure that everything is zero after our data */ for (; datap_end < datap_max; datap_end++) ASSERT(*datap_end == 0); #endif } static blkptr_t * dbuf_lightweight_bp(dbuf_dirty_record_t *dr) { /* This must be a lightweight dirty record. */ ASSERT3P(dr->dr_dbuf, ==, NULL); dnode_t *dn = dr->dr_dnode; if (dn->dn_phys->dn_nlevels == 1) { VERIFY3U(dr->dt.dll.dr_blkid, <, dn->dn_phys->dn_nblkptr); return (&dn->dn_phys->dn_blkptr[dr->dt.dll.dr_blkid]); } else { dmu_buf_impl_t *parent_db = dr->dr_parent->dr_dbuf; int epbs = dn->dn_indblkshift - SPA_BLKPTRSHIFT; VERIFY3U(parent_db->db_level, ==, 1); VERIFY3P(parent_db->db_dnode_handle->dnh_dnode, ==, dn); VERIFY3U(dr->dt.dll.dr_blkid >> epbs, ==, parent_db->db_blkid); blkptr_t *bp = parent_db->db.db_data; return (&bp[dr->dt.dll.dr_blkid & ((1 << epbs) - 1)]); } } static void dbuf_lightweight_ready(zio_t *zio) { dbuf_dirty_record_t *dr = zio->io_private; blkptr_t *bp = zio->io_bp; if (zio->io_error != 0) return; dnode_t *dn = dr->dr_dnode; blkptr_t *bp_orig = dbuf_lightweight_bp(dr); spa_t *spa = dmu_objset_spa(dn->dn_objset); int64_t delta = bp_get_dsize_sync(spa, bp) - bp_get_dsize_sync(spa, bp_orig); dnode_diduse_space(dn, delta); uint64_t blkid = dr->dt.dll.dr_blkid; mutex_enter(&dn->dn_mtx); if (blkid > dn->dn_phys->dn_maxblkid) { ASSERT0(dn->dn_objset->os_raw_receive); dn->dn_phys->dn_maxblkid = blkid; } mutex_exit(&dn->dn_mtx); if (!BP_IS_EMBEDDED(bp)) { uint64_t fill = BP_IS_HOLE(bp) ? 0 : 1; BP_SET_FILL(bp, fill); } dmu_buf_impl_t *parent_db; EQUIV(dr->dr_parent == NULL, dn->dn_phys->dn_nlevels == 1); if (dr->dr_parent == NULL) { parent_db = dn->dn_dbuf; } else { parent_db = dr->dr_parent->dr_dbuf; } rw_enter(&parent_db->db_rwlock, RW_WRITER); *bp_orig = *bp; rw_exit(&parent_db->db_rwlock); } static void dbuf_lightweight_physdone(zio_t *zio) { dbuf_dirty_record_t *dr = zio->io_private; dsl_pool_t *dp = spa_get_dsl(zio->io_spa); ASSERT3U(dr->dr_txg, ==, zio->io_txg); /* * The callback will be called io_phys_children times. Retire one * portion of our dirty space each time we are called. Any rounding * error will be cleaned up by dbuf_lightweight_done(). */ int delta = dr->dr_accounted / zio->io_phys_children; dsl_pool_undirty_space(dp, delta, zio->io_txg); } static void dbuf_lightweight_done(zio_t *zio) { dbuf_dirty_record_t *dr = zio->io_private; VERIFY0(zio->io_error); objset_t *os = dr->dr_dnode->dn_objset; dmu_tx_t *tx = os->os_synctx; if (zio->io_flags & (ZIO_FLAG_IO_REWRITE | ZIO_FLAG_NOPWRITE)) { ASSERT(BP_EQUAL(zio->io_bp, &zio->io_bp_orig)); } else { dsl_dataset_t *ds = os->os_dsl_dataset; (void) dsl_dataset_block_kill(ds, &zio->io_bp_orig, tx, B_TRUE); dsl_dataset_block_born(ds, zio->io_bp, tx); } /* * See comment in dbuf_write_done(). */ if (zio->io_phys_children == 0) { dsl_pool_undirty_space(dmu_objset_pool(os), dr->dr_accounted, zio->io_txg); } else { dsl_pool_undirty_space(dmu_objset_pool(os), dr->dr_accounted % zio->io_phys_children, zio->io_txg); } abd_free(dr->dt.dll.dr_abd); kmem_free(dr, sizeof (*dr)); } noinline static void dbuf_sync_lightweight(dbuf_dirty_record_t *dr, dmu_tx_t *tx) { dnode_t *dn = dr->dr_dnode; zio_t *pio; if (dn->dn_phys->dn_nlevels == 1) { pio = dn->dn_zio; } else { pio = dr->dr_parent->dr_zio; } zbookmark_phys_t zb = { .zb_objset = dmu_objset_id(dn->dn_objset), .zb_object = dn->dn_object, .zb_level = 0, .zb_blkid = dr->dt.dll.dr_blkid, }; /* * See comment in dbuf_write(). This is so that zio->io_bp_orig * will have the old BP in dbuf_lightweight_done(). */ dr->dr_bp_copy = *dbuf_lightweight_bp(dr); dr->dr_zio = zio_write(pio, dmu_objset_spa(dn->dn_objset), dmu_tx_get_txg(tx), &dr->dr_bp_copy, dr->dt.dll.dr_abd, dn->dn_datablksz, abd_get_size(dr->dt.dll.dr_abd), &dr->dt.dll.dr_props, dbuf_lightweight_ready, NULL, dbuf_lightweight_physdone, dbuf_lightweight_done, dr, ZIO_PRIORITY_ASYNC_WRITE, ZIO_FLAG_MUSTSUCCEED | dr->dt.dll.dr_flags, &zb); zio_nowait(dr->dr_zio); } /* * dbuf_sync_leaf() is called recursively from dbuf_sync_list() so it is * critical the we not allow the compiler to inline this function in to * dbuf_sync_list() thereby drastically bloating the stack usage. */ noinline static void dbuf_sync_leaf(dbuf_dirty_record_t *dr, dmu_tx_t *tx) { arc_buf_t **datap = &dr->dt.dl.dr_data; dmu_buf_impl_t *db = dr->dr_dbuf; dnode_t *dn = dr->dr_dnode; objset_t *os; uint64_t txg = tx->tx_txg; ASSERT(dmu_tx_is_syncing(tx)); dprintf_dbuf_bp(db, db->db_blkptr, "blkptr=%p", db->db_blkptr); mutex_enter(&db->db_mtx); /* * To be synced, we must be dirtied. But we * might have been freed after the dirty. */ if (db->db_state == DB_UNCACHED) { /* This buffer has been freed since it was dirtied */ ASSERT(db->db.db_data == NULL); } else if (db->db_state == DB_FILL) { /* This buffer was freed and is now being re-filled */ ASSERT(db->db.db_data != dr->dt.dl.dr_data); } else { ASSERT(db->db_state == DB_CACHED || db->db_state == DB_NOFILL); } DBUF_VERIFY(db); if (db->db_blkid == DMU_SPILL_BLKID) { mutex_enter(&dn->dn_mtx); if (!(dn->dn_phys->dn_flags & DNODE_FLAG_SPILL_BLKPTR)) { /* * In the previous transaction group, the bonus buffer * was entirely used to store the attributes for the * dnode which overrode the dn_spill field. However, * when adding more attributes to the file a spill * block was required to hold the extra attributes. * * Make sure to clear the garbage left in the dn_spill * field from the previous attributes in the bonus * buffer. Otherwise, after writing out the spill * block to the new allocated dva, it will free * the old block pointed to by the invalid dn_spill. */ db->db_blkptr = NULL; } dn->dn_phys->dn_flags |= DNODE_FLAG_SPILL_BLKPTR; mutex_exit(&dn->dn_mtx); } /* * If this is a bonus buffer, simply copy the bonus data into the * dnode. It will be written out when the dnode is synced (and it * will be synced, since it must have been dirty for dbuf_sync to * be called). */ if (db->db_blkid == DMU_BONUS_BLKID) { ASSERT(dr->dr_dbuf == db); dbuf_sync_bonus(dr, tx); return; } os = dn->dn_objset; /* * This function may have dropped the db_mtx lock allowing a dmu_sync * operation to sneak in. As a result, we need to ensure that we * don't check the dr_override_state until we have returned from * dbuf_check_blkptr. */ dbuf_check_blkptr(dn, db); /* * If this buffer is in the middle of an immediate write, * wait for the synchronous IO to complete. */ while (dr->dt.dl.dr_override_state == DR_IN_DMU_SYNC) { ASSERT(dn->dn_object != DMU_META_DNODE_OBJECT); cv_wait(&db->db_changed, &db->db_mtx); ASSERT(dr->dt.dl.dr_override_state != DR_NOT_OVERRIDDEN); } /* * If this is a dnode block, ensure it is appropriately encrypted * or decrypted, depending on what we are writing to it this txg. */ if (os->os_encrypted && dn->dn_object == DMU_META_DNODE_OBJECT) dbuf_prepare_encrypted_dnode_leaf(dr); if (db->db_state != DB_NOFILL && dn->dn_object != DMU_META_DNODE_OBJECT && zfs_refcount_count(&db->db_holds) > 1 && dr->dt.dl.dr_override_state != DR_OVERRIDDEN && *datap == db->db_buf) { /* * If this buffer is currently "in use" (i.e., there * are active holds and db_data still references it), * then make a copy before we start the write so that * any modifications from the open txg will not leak * into this write. * * NOTE: this copy does not need to be made for * objects only modified in the syncing context (e.g. * DNONE_DNODE blocks). */ int psize = arc_buf_size(*datap); int lsize = arc_buf_lsize(*datap); arc_buf_contents_t type = DBUF_GET_BUFC_TYPE(db); enum zio_compress compress_type = arc_get_compression(*datap); uint8_t complevel = arc_get_complevel(*datap); if (arc_is_encrypted(*datap)) { boolean_t byteorder; uint8_t salt[ZIO_DATA_SALT_LEN]; uint8_t iv[ZIO_DATA_IV_LEN]; uint8_t mac[ZIO_DATA_MAC_LEN]; arc_get_raw_params(*datap, &byteorder, salt, iv, mac); *datap = arc_alloc_raw_buf(os->os_spa, db, dmu_objset_id(os), byteorder, salt, iv, mac, dn->dn_type, psize, lsize, compress_type, complevel); } else if (compress_type != ZIO_COMPRESS_OFF) { ASSERT3U(type, ==, ARC_BUFC_DATA); *datap = arc_alloc_compressed_buf(os->os_spa, db, psize, lsize, compress_type, complevel); } else { *datap = arc_alloc_buf(os->os_spa, db, type, psize); } memcpy((*datap)->b_data, db->db.db_data, psize); } db->db_data_pending = dr; mutex_exit(&db->db_mtx); dbuf_write(dr, *datap, tx); ASSERT(!list_link_active(&dr->dr_dirty_node)); if (dn->dn_object == DMU_META_DNODE_OBJECT) { list_insert_tail(&dn->dn_dirty_records[txg & TXG_MASK], dr); } else { zio_nowait(dr->dr_zio); } } void dbuf_sync_list(list_t *list, int level, dmu_tx_t *tx) { dbuf_dirty_record_t *dr; while ((dr = list_head(list))) { if (dr->dr_zio != NULL) { /* * If we find an already initialized zio then we * are processing the meta-dnode, and we have finished. * The dbufs for all dnodes are put back on the list * during processing, so that we can zio_wait() * these IOs after initiating all child IOs. */ ASSERT3U(dr->dr_dbuf->db.db_object, ==, DMU_META_DNODE_OBJECT); break; } list_remove(list, dr); if (dr->dr_dbuf == NULL) { dbuf_sync_lightweight(dr, tx); } else { if (dr->dr_dbuf->db_blkid != DMU_BONUS_BLKID && dr->dr_dbuf->db_blkid != DMU_SPILL_BLKID) { VERIFY3U(dr->dr_dbuf->db_level, ==, level); } if (dr->dr_dbuf->db_level > 0) dbuf_sync_indirect(dr, tx); else dbuf_sync_leaf(dr, tx); } } } static void dbuf_write_ready(zio_t *zio, arc_buf_t *buf, void *vdb) { (void) buf; dmu_buf_impl_t *db = vdb; dnode_t *dn; blkptr_t *bp = zio->io_bp; blkptr_t *bp_orig = &zio->io_bp_orig; spa_t *spa = zio->io_spa; int64_t delta; uint64_t fill = 0; int i; ASSERT3P(db->db_blkptr, !=, NULL); ASSERT3P(&db->db_data_pending->dr_bp_copy, ==, bp); DB_DNODE_ENTER(db); dn = DB_DNODE(db); delta = bp_get_dsize_sync(spa, bp) - bp_get_dsize_sync(spa, bp_orig); dnode_diduse_space(dn, delta - zio->io_prev_space_delta); zio->io_prev_space_delta = delta; if (bp->blk_birth != 0) { ASSERT((db->db_blkid != DMU_SPILL_BLKID && BP_GET_TYPE(bp) == dn->dn_type) || (db->db_blkid == DMU_SPILL_BLKID && BP_GET_TYPE(bp) == dn->dn_bonustype) || BP_IS_EMBEDDED(bp)); ASSERT(BP_GET_LEVEL(bp) == db->db_level); } mutex_enter(&db->db_mtx); #ifdef ZFS_DEBUG if (db->db_blkid == DMU_SPILL_BLKID) { ASSERT(dn->dn_phys->dn_flags & DNODE_FLAG_SPILL_BLKPTR); ASSERT(!(BP_IS_HOLE(bp)) && db->db_blkptr == DN_SPILL_BLKPTR(dn->dn_phys)); } #endif if (db->db_level == 0) { mutex_enter(&dn->dn_mtx); if (db->db_blkid > dn->dn_phys->dn_maxblkid && db->db_blkid != DMU_SPILL_BLKID) { ASSERT0(db->db_objset->os_raw_receive); dn->dn_phys->dn_maxblkid = db->db_blkid; } mutex_exit(&dn->dn_mtx); if (dn->dn_type == DMU_OT_DNODE) { i = 0; while (i < db->db.db_size) { dnode_phys_t *dnp = (void *)(((char *)db->db.db_data) + i); i += DNODE_MIN_SIZE; if (dnp->dn_type != DMU_OT_NONE) { fill++; i += dnp->dn_extra_slots * DNODE_MIN_SIZE; } } } else { if (BP_IS_HOLE(bp)) { fill = 0; } else { fill = 1; } } } else { blkptr_t *ibp = db->db.db_data; ASSERT3U(db->db.db_size, ==, 1<dn_phys->dn_indblkshift); for (i = db->db.db_size >> SPA_BLKPTRSHIFT; i > 0; i--, ibp++) { if (BP_IS_HOLE(ibp)) continue; fill += BP_GET_FILL(ibp); } } DB_DNODE_EXIT(db); if (!BP_IS_EMBEDDED(bp)) BP_SET_FILL(bp, fill); mutex_exit(&db->db_mtx); db_lock_type_t dblt = dmu_buf_lock_parent(db, RW_WRITER, FTAG); *db->db_blkptr = *bp; dmu_buf_unlock_parent(db, dblt, FTAG); } /* * This function gets called just prior to running through the compression * stage of the zio pipeline. If we're an indirect block comprised of only * holes, then we want this indirect to be compressed away to a hole. In * order to do that we must zero out any information about the holes that * this indirect points to prior to before we try to compress it. */ static void dbuf_write_children_ready(zio_t *zio, arc_buf_t *buf, void *vdb) { (void) zio, (void) buf; dmu_buf_impl_t *db = vdb; dnode_t *dn; blkptr_t *bp; unsigned int epbs, i; ASSERT3U(db->db_level, >, 0); DB_DNODE_ENTER(db); dn = DB_DNODE(db); epbs = dn->dn_phys->dn_indblkshift - SPA_BLKPTRSHIFT; ASSERT3U(epbs, <, 31); /* Determine if all our children are holes */ for (i = 0, bp = db->db.db_data; i < 1ULL << epbs; i++, bp++) { if (!BP_IS_HOLE(bp)) break; } /* * If all the children are holes, then zero them all out so that * we may get compressed away. */ if (i == 1ULL << epbs) { /* * We only found holes. Grab the rwlock to prevent * anybody from reading the blocks we're about to * zero out. */ rw_enter(&db->db_rwlock, RW_WRITER); memset(db->db.db_data, 0, db->db.db_size); rw_exit(&db->db_rwlock); } DB_DNODE_EXIT(db); } /* * The SPA will call this callback several times for each zio - once * for every physical child i/o (zio->io_phys_children times). This * allows the DMU to monitor the progress of each logical i/o. For example, * there may be 2 copies of an indirect block, or many fragments of a RAID-Z * block. There may be a long delay before all copies/fragments are completed, * so this callback allows us to retire dirty space gradually, as the physical * i/os complete. */ static void dbuf_write_physdone(zio_t *zio, arc_buf_t *buf, void *arg) { (void) buf; dmu_buf_impl_t *db = arg; objset_t *os = db->db_objset; dsl_pool_t *dp = dmu_objset_pool(os); dbuf_dirty_record_t *dr; int delta = 0; dr = db->db_data_pending; ASSERT3U(dr->dr_txg, ==, zio->io_txg); /* * The callback will be called io_phys_children times. Retire one * portion of our dirty space each time we are called. Any rounding * error will be cleaned up by dbuf_write_done(). */ delta = dr->dr_accounted / zio->io_phys_children; dsl_pool_undirty_space(dp, delta, zio->io_txg); } static void dbuf_write_done(zio_t *zio, arc_buf_t *buf, void *vdb) { (void) buf; dmu_buf_impl_t *db = vdb; blkptr_t *bp_orig = &zio->io_bp_orig; blkptr_t *bp = db->db_blkptr; objset_t *os = db->db_objset; dmu_tx_t *tx = os->os_synctx; ASSERT0(zio->io_error); ASSERT(db->db_blkptr == bp); /* * For nopwrites and rewrites we ensure that the bp matches our * original and bypass all the accounting. */ if (zio->io_flags & (ZIO_FLAG_IO_REWRITE | ZIO_FLAG_NOPWRITE)) { ASSERT(BP_EQUAL(bp, bp_orig)); } else { dsl_dataset_t *ds = os->os_dsl_dataset; (void) dsl_dataset_block_kill(ds, bp_orig, tx, B_TRUE); dsl_dataset_block_born(ds, bp, tx); } mutex_enter(&db->db_mtx); DBUF_VERIFY(db); dbuf_dirty_record_t *dr = db->db_data_pending; dnode_t *dn = dr->dr_dnode; ASSERT(!list_link_active(&dr->dr_dirty_node)); ASSERT(dr->dr_dbuf == db); ASSERT(list_next(&db->db_dirty_records, dr) == NULL); list_remove(&db->db_dirty_records, dr); #ifdef ZFS_DEBUG if (db->db_blkid == DMU_SPILL_BLKID) { ASSERT(dn->dn_phys->dn_flags & DNODE_FLAG_SPILL_BLKPTR); ASSERT(!(BP_IS_HOLE(db->db_blkptr)) && db->db_blkptr == DN_SPILL_BLKPTR(dn->dn_phys)); } #endif if (db->db_level == 0) { ASSERT(db->db_blkid != DMU_BONUS_BLKID); ASSERT(dr->dt.dl.dr_override_state == DR_NOT_OVERRIDDEN); if (db->db_state != DB_NOFILL) { if (dr->dt.dl.dr_data != db->db_buf) arc_buf_destroy(dr->dt.dl.dr_data, db); } } else { ASSERT(list_head(&dr->dt.di.dr_children) == NULL); ASSERT3U(db->db.db_size, ==, 1 << dn->dn_phys->dn_indblkshift); if (!BP_IS_HOLE(db->db_blkptr)) { int epbs __maybe_unused = dn->dn_phys->dn_indblkshift - SPA_BLKPTRSHIFT; ASSERT3U(db->db_blkid, <=, dn->dn_phys->dn_maxblkid >> (db->db_level * epbs)); ASSERT3U(BP_GET_LSIZE(db->db_blkptr), ==, db->db.db_size); } mutex_destroy(&dr->dt.di.dr_mtx); list_destroy(&dr->dt.di.dr_children); } cv_broadcast(&db->db_changed); ASSERT(db->db_dirtycnt > 0); db->db_dirtycnt -= 1; db->db_data_pending = NULL; dbuf_rele_and_unlock(db, (void *)(uintptr_t)tx->tx_txg, B_FALSE); /* * If we didn't do a physical write in this ZIO and we * still ended up here, it means that the space of the * dbuf that we just released (and undirtied) above hasn't * been marked as undirtied in the pool's accounting. * * Thus, we undirty that space in the pool's view of the * world here. For physical writes this type of update * happens in dbuf_write_physdone(). * * If we did a physical write, cleanup any rounding errors * that came up due to writing multiple copies of a block * on disk [see dbuf_write_physdone()]. */ if (zio->io_phys_children == 0) { dsl_pool_undirty_space(dmu_objset_pool(os), dr->dr_accounted, zio->io_txg); } else { dsl_pool_undirty_space(dmu_objset_pool(os), dr->dr_accounted % zio->io_phys_children, zio->io_txg); } kmem_free(dr, sizeof (dbuf_dirty_record_t)); } static void dbuf_write_nofill_ready(zio_t *zio) { dbuf_write_ready(zio, NULL, zio->io_private); } static void dbuf_write_nofill_done(zio_t *zio) { dbuf_write_done(zio, NULL, zio->io_private); } static void dbuf_write_override_ready(zio_t *zio) { dbuf_dirty_record_t *dr = zio->io_private; dmu_buf_impl_t *db = dr->dr_dbuf; dbuf_write_ready(zio, NULL, db); } static void dbuf_write_override_done(zio_t *zio) { dbuf_dirty_record_t *dr = zio->io_private; dmu_buf_impl_t *db = dr->dr_dbuf; blkptr_t *obp = &dr->dt.dl.dr_overridden_by; mutex_enter(&db->db_mtx); if (!BP_EQUAL(zio->io_bp, obp)) { if (!BP_IS_HOLE(obp)) dsl_free(spa_get_dsl(zio->io_spa), zio->io_txg, obp); arc_release(dr->dt.dl.dr_data, db); } mutex_exit(&db->db_mtx); dbuf_write_done(zio, NULL, db); if (zio->io_abd != NULL) abd_free(zio->io_abd); } typedef struct dbuf_remap_impl_callback_arg { objset_t *drica_os; uint64_t drica_blk_birth; dmu_tx_t *drica_tx; } dbuf_remap_impl_callback_arg_t; static void dbuf_remap_impl_callback(uint64_t vdev, uint64_t offset, uint64_t size, void *arg) { dbuf_remap_impl_callback_arg_t *drica = arg; objset_t *os = drica->drica_os; spa_t *spa = dmu_objset_spa(os); dmu_tx_t *tx = drica->drica_tx; ASSERT(dsl_pool_sync_context(spa_get_dsl(spa))); if (os == spa_meta_objset(spa)) { spa_vdev_indirect_mark_obsolete(spa, vdev, offset, size, tx); } else { dsl_dataset_block_remapped(dmu_objset_ds(os), vdev, offset, size, drica->drica_blk_birth, tx); } } static void dbuf_remap_impl(dnode_t *dn, blkptr_t *bp, krwlock_t *rw, dmu_tx_t *tx) { blkptr_t bp_copy = *bp; spa_t *spa = dmu_objset_spa(dn->dn_objset); dbuf_remap_impl_callback_arg_t drica; ASSERT(dsl_pool_sync_context(spa_get_dsl(spa))); drica.drica_os = dn->dn_objset; drica.drica_blk_birth = bp->blk_birth; drica.drica_tx = tx; if (spa_remap_blkptr(spa, &bp_copy, dbuf_remap_impl_callback, &drica)) { /* * If the blkptr being remapped is tracked by a livelist, * then we need to make sure the livelist reflects the update. * First, cancel out the old blkptr by appending a 'FREE' * entry. Next, add an 'ALLOC' to track the new version. This * way we avoid trying to free an inaccurate blkptr at delete. * Note that embedded blkptrs are not tracked in livelists. */ if (dn->dn_objset != spa_meta_objset(spa)) { dsl_dataset_t *ds = dmu_objset_ds(dn->dn_objset); if (dsl_deadlist_is_open(&ds->ds_dir->dd_livelist) && bp->blk_birth > ds->ds_dir->dd_origin_txg) { ASSERT(!BP_IS_EMBEDDED(bp)); ASSERT(dsl_dir_is_clone(ds->ds_dir)); ASSERT(spa_feature_is_enabled(spa, SPA_FEATURE_LIVELIST)); bplist_append(&ds->ds_dir->dd_pending_frees, bp); bplist_append(&ds->ds_dir->dd_pending_allocs, &bp_copy); } } /* * The db_rwlock prevents dbuf_read_impl() from * dereferencing the BP while we are changing it. To * avoid lock contention, only grab it when we are actually * changing the BP. */ if (rw != NULL) rw_enter(rw, RW_WRITER); *bp = bp_copy; if (rw != NULL) rw_exit(rw); } } /* * Remap any existing BP's to concrete vdevs, if possible. */ static void dbuf_remap(dnode_t *dn, dmu_buf_impl_t *db, dmu_tx_t *tx) { spa_t *spa = dmu_objset_spa(db->db_objset); ASSERT(dsl_pool_sync_context(spa_get_dsl(spa))); if (!spa_feature_is_active(spa, SPA_FEATURE_DEVICE_REMOVAL)) return; if (db->db_level > 0) { blkptr_t *bp = db->db.db_data; for (int i = 0; i < db->db.db_size >> SPA_BLKPTRSHIFT; i++) { dbuf_remap_impl(dn, &bp[i], &db->db_rwlock, tx); } } else if (db->db.db_object == DMU_META_DNODE_OBJECT) { dnode_phys_t *dnp = db->db.db_data; ASSERT3U(db->db_dnode_handle->dnh_dnode->dn_type, ==, DMU_OT_DNODE); for (int i = 0; i < db->db.db_size >> DNODE_SHIFT; i += dnp[i].dn_extra_slots + 1) { for (int j = 0; j < dnp[i].dn_nblkptr; j++) { krwlock_t *lock = (dn->dn_dbuf == NULL ? NULL : &dn->dn_dbuf->db_rwlock); dbuf_remap_impl(dn, &dnp[i].dn_blkptr[j], lock, tx); } } } } /* Issue I/O to commit a dirty buffer to disk. */ static void dbuf_write(dbuf_dirty_record_t *dr, arc_buf_t *data, dmu_tx_t *tx) { dmu_buf_impl_t *db = dr->dr_dbuf; dnode_t *dn = dr->dr_dnode; objset_t *os; dmu_buf_impl_t *parent = db->db_parent; uint64_t txg = tx->tx_txg; zbookmark_phys_t zb; zio_prop_t zp; zio_t *pio; /* parent I/O */ int wp_flag = 0; ASSERT(dmu_tx_is_syncing(tx)); os = dn->dn_objset; if (db->db_state != DB_NOFILL) { if (db->db_level > 0 || dn->dn_type == DMU_OT_DNODE) { /* * Private object buffers are released here rather * than in dbuf_dirty() since they are only modified * in the syncing context and we don't want the * overhead of making multiple copies of the data. */ if (BP_IS_HOLE(db->db_blkptr)) { arc_buf_thaw(data); } else { dbuf_release_bp(db); } dbuf_remap(dn, db, tx); } } if (parent != dn->dn_dbuf) { /* Our parent is an indirect block. */ /* We have a dirty parent that has been scheduled for write. */ ASSERT(parent && parent->db_data_pending); /* Our parent's buffer is one level closer to the dnode. */ ASSERT(db->db_level == parent->db_level-1); /* * We're about to modify our parent's db_data by modifying * our block pointer, so the parent must be released. */ ASSERT(arc_released(parent->db_buf)); pio = parent->db_data_pending->dr_zio; } else { /* Our parent is the dnode itself. */ ASSERT((db->db_level == dn->dn_phys->dn_nlevels-1 && db->db_blkid != DMU_SPILL_BLKID) || (db->db_blkid == DMU_SPILL_BLKID && db->db_level == 0)); if (db->db_blkid != DMU_SPILL_BLKID) ASSERT3P(db->db_blkptr, ==, &dn->dn_phys->dn_blkptr[db->db_blkid]); pio = dn->dn_zio; } ASSERT(db->db_level == 0 || data == db->db_buf); ASSERT3U(db->db_blkptr->blk_birth, <=, txg); ASSERT(pio); SET_BOOKMARK(&zb, os->os_dsl_dataset ? os->os_dsl_dataset->ds_object : DMU_META_OBJSET, db->db.db_object, db->db_level, db->db_blkid); if (db->db_blkid == DMU_SPILL_BLKID) wp_flag = WP_SPILL; wp_flag |= (db->db_state == DB_NOFILL) ? WP_NOFILL : 0; dmu_write_policy(os, dn, db->db_level, wp_flag, &zp); /* * We copy the blkptr now (rather than when we instantiate the dirty * record), because its value can change between open context and * syncing context. We do not need to hold dn_struct_rwlock to read * db_blkptr because we are in syncing context. */ dr->dr_bp_copy = *db->db_blkptr; if (db->db_level == 0 && dr->dt.dl.dr_override_state == DR_OVERRIDDEN) { /* * The BP for this block has been provided by open context * (by dmu_sync() or dmu_buf_write_embedded()). */ abd_t *contents = (data != NULL) ? abd_get_from_buf(data->b_data, arc_buf_size(data)) : NULL; dr->dr_zio = zio_write(pio, os->os_spa, txg, &dr->dr_bp_copy, contents, db->db.db_size, db->db.db_size, &zp, dbuf_write_override_ready, NULL, NULL, dbuf_write_override_done, dr, ZIO_PRIORITY_ASYNC_WRITE, ZIO_FLAG_MUSTSUCCEED, &zb); mutex_enter(&db->db_mtx); dr->dt.dl.dr_override_state = DR_NOT_OVERRIDDEN; zio_write_override(dr->dr_zio, &dr->dt.dl.dr_overridden_by, dr->dt.dl.dr_copies, dr->dt.dl.dr_nopwrite); mutex_exit(&db->db_mtx); } else if (db->db_state == DB_NOFILL) { ASSERT(zp.zp_checksum == ZIO_CHECKSUM_OFF || zp.zp_checksum == ZIO_CHECKSUM_NOPARITY); dr->dr_zio = zio_write(pio, os->os_spa, txg, &dr->dr_bp_copy, NULL, db->db.db_size, db->db.db_size, &zp, dbuf_write_nofill_ready, NULL, NULL, dbuf_write_nofill_done, db, ZIO_PRIORITY_ASYNC_WRITE, ZIO_FLAG_MUSTSUCCEED | ZIO_FLAG_NODATA, &zb); } else { ASSERT(arc_released(data)); /* * For indirect blocks, we want to setup the children * ready callback so that we can properly handle an indirect * block that only contains holes. */ arc_write_done_func_t *children_ready_cb = NULL; if (db->db_level != 0) children_ready_cb = dbuf_write_children_ready; dr->dr_zio = arc_write(pio, os->os_spa, txg, &dr->dr_bp_copy, data, dbuf_is_l2cacheable(db), &zp, dbuf_write_ready, children_ready_cb, dbuf_write_physdone, dbuf_write_done, db, ZIO_PRIORITY_ASYNC_WRITE, ZIO_FLAG_MUSTSUCCEED, &zb); } } EXPORT_SYMBOL(dbuf_find); EXPORT_SYMBOL(dbuf_is_metadata); EXPORT_SYMBOL(dbuf_destroy); EXPORT_SYMBOL(dbuf_loan_arcbuf); EXPORT_SYMBOL(dbuf_whichblock); EXPORT_SYMBOL(dbuf_read); EXPORT_SYMBOL(dbuf_unoverride); EXPORT_SYMBOL(dbuf_free_range); EXPORT_SYMBOL(dbuf_new_size); EXPORT_SYMBOL(dbuf_release_bp); EXPORT_SYMBOL(dbuf_dirty); EXPORT_SYMBOL(dmu_buf_set_crypt_params); EXPORT_SYMBOL(dmu_buf_will_dirty); EXPORT_SYMBOL(dmu_buf_is_dirty); EXPORT_SYMBOL(dmu_buf_will_not_fill); EXPORT_SYMBOL(dmu_buf_will_fill); EXPORT_SYMBOL(dmu_buf_fill_done); EXPORT_SYMBOL(dmu_buf_rele); EXPORT_SYMBOL(dbuf_assign_arcbuf); EXPORT_SYMBOL(dbuf_prefetch); EXPORT_SYMBOL(dbuf_hold_impl); EXPORT_SYMBOL(dbuf_hold); EXPORT_SYMBOL(dbuf_hold_level); EXPORT_SYMBOL(dbuf_create_bonus); EXPORT_SYMBOL(dbuf_spill_set_blksz); EXPORT_SYMBOL(dbuf_rm_spill); EXPORT_SYMBOL(dbuf_add_ref); EXPORT_SYMBOL(dbuf_rele); EXPORT_SYMBOL(dbuf_rele_and_unlock); EXPORT_SYMBOL(dbuf_refcount); EXPORT_SYMBOL(dbuf_sync_list); EXPORT_SYMBOL(dmu_buf_set_user); EXPORT_SYMBOL(dmu_buf_set_user_ie); EXPORT_SYMBOL(dmu_buf_get_user); EXPORT_SYMBOL(dmu_buf_get_blkptr); ZFS_MODULE_PARAM(zfs_dbuf_cache, dbuf_cache_, max_bytes, U64, ZMOD_RW, "Maximum size in bytes of the dbuf cache."); ZFS_MODULE_PARAM(zfs_dbuf_cache, dbuf_cache_, hiwater_pct, UINT, ZMOD_RW, "Percentage over dbuf_cache_max_bytes for direct dbuf eviction."); ZFS_MODULE_PARAM(zfs_dbuf_cache, dbuf_cache_, lowater_pct, UINT, ZMOD_RW, "Percentage below dbuf_cache_max_bytes when dbuf eviction stops."); ZFS_MODULE_PARAM(zfs_dbuf, dbuf_, metadata_cache_max_bytes, U64, ZMOD_RW, "Maximum size in bytes of dbuf metadata cache."); ZFS_MODULE_PARAM(zfs_dbuf, dbuf_, cache_shift, UINT, ZMOD_RW, "Set size of dbuf cache to log2 fraction of arc size."); ZFS_MODULE_PARAM(zfs_dbuf, dbuf_, metadata_cache_shift, UINT, ZMOD_RW, "Set size of dbuf metadata cache to log2 fraction of arc size."); ZFS_MODULE_PARAM(zfs_dbuf, dbuf_, mutex_cache_shift, UINT, ZMOD_RD, "Set size of dbuf cache mutex array as log2 shift."); diff --git a/module/zfs/dsl_bookmark.c b/module/zfs/dsl_bookmark.c index 8ca7ba8957aa..b95c94beff1f 100644 --- a/module/zfs/dsl_bookmark.c +++ b/module/zfs/dsl_bookmark.c @@ -1,1733 +1,1732 @@ /* * CDDL HEADER START * * This file and its contents are supplied under the terms of the * Common Development and Distribution License ("CDDL"), version 1.0. * You may only use this file in accordance with the terms of version * 1.0 of the CDDL. * * A full copy of the text of the CDDL should have accompanied this * source. A copy of the CDDL is also available via the Internet at * http://www.illumos.org/license/CDDL. * * CDDL HEADER END */ /* * Copyright (c) 2013, 2018 by Delphix. All rights reserved. * Copyright 2017 Nexenta Systems, Inc. * Copyright 2019, 2020 by Christian Schwarz. All rights reserved. */ #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include static int dsl_bookmark_hold_ds(dsl_pool_t *dp, const char *fullname, dsl_dataset_t **dsp, const void *tag, char **shortnamep) { char buf[ZFS_MAX_DATASET_NAME_LEN]; char *hashp; if (strlen(fullname) >= ZFS_MAX_DATASET_NAME_LEN) return (SET_ERROR(ENAMETOOLONG)); hashp = strchr(fullname, '#'); if (hashp == NULL) return (SET_ERROR(EINVAL)); *shortnamep = hashp + 1; if (zfs_component_namecheck(*shortnamep, NULL, NULL)) return (SET_ERROR(EINVAL)); (void) strlcpy(buf, fullname, hashp - fullname + 1); return (dsl_dataset_hold(dp, buf, tag, dsp)); } /* * When reading BOOKMARK_V1 bookmarks, the BOOKMARK_V2 fields are guaranteed * to be zeroed. * * Returns ESRCH if bookmark is not found. * Note, we need to use the ZAP rather than the AVL to look up bookmarks * by name, because only the ZAP honors the casesensitivity setting. */ int dsl_bookmark_lookup_impl(dsl_dataset_t *ds, const char *shortname, zfs_bookmark_phys_t *bmark_phys) { objset_t *mos = ds->ds_dir->dd_pool->dp_meta_objset; uint64_t bmark_zapobj = ds->ds_bookmarks_obj; matchtype_t mt = 0; int err; if (bmark_zapobj == 0) return (SET_ERROR(ESRCH)); if (dsl_dataset_phys(ds)->ds_flags & DS_FLAG_CI_DATASET) mt = MT_NORMALIZE; /* * Zero out the bookmark in case the one stored on disk * is in an older, shorter format. */ memset(bmark_phys, 0, sizeof (*bmark_phys)); err = zap_lookup_norm(mos, bmark_zapobj, shortname, sizeof (uint64_t), sizeof (*bmark_phys) / sizeof (uint64_t), bmark_phys, mt, NULL, 0, NULL); return (err == ENOENT ? SET_ERROR(ESRCH) : err); } /* * If later_ds is non-NULL, this will return EXDEV if the specified bookmark * does not represents an earlier point in later_ds's timeline. However, * bmp will still be filled in if we return EXDEV. * * Returns ENOENT if the dataset containing the bookmark does not exist. * Returns ESRCH if the dataset exists but the bookmark was not found in it. */ int dsl_bookmark_lookup(dsl_pool_t *dp, const char *fullname, dsl_dataset_t *later_ds, zfs_bookmark_phys_t *bmp) { char *shortname; dsl_dataset_t *ds; int error; error = dsl_bookmark_hold_ds(dp, fullname, &ds, FTAG, &shortname); if (error != 0) return (error); error = dsl_bookmark_lookup_impl(ds, shortname, bmp); if (error == 0 && later_ds != NULL) { if (!dsl_dataset_is_before(later_ds, ds, bmp->zbm_creation_txg)) error = SET_ERROR(EXDEV); } dsl_dataset_rele(ds, FTAG); return (error); } /* * Validates that * - bmark is a full dataset path of a bookmark (bookmark_namecheck) * - source is a full path of a snapshot or bookmark * ({bookmark,snapshot}_namecheck) * * Returns 0 if valid, -1 otherwise. */ static int dsl_bookmark_create_nvl_validate_pair(const char *bmark, const char *source) { if (bookmark_namecheck(bmark, NULL, NULL) != 0) return (-1); int is_bmark, is_snap; is_bmark = bookmark_namecheck(source, NULL, NULL) == 0; is_snap = snapshot_namecheck(source, NULL, NULL) == 0; if (!is_bmark && !is_snap) return (-1); return (0); } /* * Check that the given nvlist corresponds to the following schema: * { newbookmark -> source, ... } * where * - each pair passes dsl_bookmark_create_nvl_validate_pair * - all newbookmarks are in the same pool * - all newbookmarks have unique names * * Note that this function is only validates above schema. Callers must ensure * that the bookmarks can be created, e.g. that sources exist. * * Returns 0 if the nvlist adheres to above schema. * Returns -1 if it doesn't. */ int dsl_bookmark_create_nvl_validate(nvlist_t *bmarks) { char *first = NULL; size_t first_len = 0; for (nvpair_t *pair = nvlist_next_nvpair(bmarks, NULL); pair != NULL; pair = nvlist_next_nvpair(bmarks, pair)) { char *bmark = nvpair_name(pair); char *source; /* list structure: values must be snapshots XOR bookmarks */ if (nvpair_value_string(pair, &source) != 0) return (-1); if (dsl_bookmark_create_nvl_validate_pair(bmark, source) != 0) return (-1); /* same pool check */ if (first == NULL) { char *cp = strpbrk(bmark, "/#"); if (cp == NULL) return (-1); first = bmark; first_len = cp - bmark; } if (strncmp(first, bmark, first_len) != 0) return (-1); switch (*(bmark + first_len)) { case '/': /* fallthrough */ case '#': break; default: return (-1); } /* unique newbookmark names; todo: O(n^2) */ for (nvpair_t *pair2 = nvlist_next_nvpair(bmarks, pair); pair2 != NULL; pair2 = nvlist_next_nvpair(bmarks, pair2)) { if (strcmp(nvpair_name(pair), nvpair_name(pair2)) == 0) return (-1); } } return (0); } /* * expects that newbm and source have been validated using * dsl_bookmark_create_nvl_validate_pair */ static int dsl_bookmark_create_check_impl(dsl_pool_t *dp, const char *newbm, const char *source) { ASSERT0(dsl_bookmark_create_nvl_validate_pair(newbm, source)); /* defer source namecheck until we know it's a snapshot or bookmark */ int error; dsl_dataset_t *newbm_ds; char *newbm_short; zfs_bookmark_phys_t bmark_phys; error = dsl_bookmark_hold_ds(dp, newbm, &newbm_ds, FTAG, &newbm_short); if (error != 0) return (error); /* Verify that the new bookmark does not already exist */ error = dsl_bookmark_lookup_impl(newbm_ds, newbm_short, &bmark_phys); switch (error) { case ESRCH: /* happy path: new bmark doesn't exist, proceed after switch */ - error = 0; break; case 0: error = SET_ERROR(EEXIST); goto eholdnewbmds; default: /* dsl_bookmark_lookup_impl already did SET_ERROR */ goto eholdnewbmds; } /* error is retval of the following if-cascade */ if (strchr(source, '@') != NULL) { dsl_dataset_t *source_snap_ds; ASSERT3S(snapshot_namecheck(source, NULL, NULL), ==, 0); error = dsl_dataset_hold(dp, source, FTAG, &source_snap_ds); if (error == 0) { VERIFY(source_snap_ds->ds_is_snapshot); /* * Verify that source snapshot is an earlier point in * newbm_ds's timeline (source may be newbm_ds's origin) */ if (!dsl_dataset_is_before(newbm_ds, source_snap_ds, 0)) error = SET_ERROR( ZFS_ERR_BOOKMARK_SOURCE_NOT_ANCESTOR); dsl_dataset_rele(source_snap_ds, FTAG); } } else if (strchr(source, '#') != NULL) { zfs_bookmark_phys_t source_phys; ASSERT3S(bookmark_namecheck(source, NULL, NULL), ==, 0); /* * Source must exists and be an earlier point in newbm_ds's * timeline (newbm_ds's origin may be a snap of source's ds) */ error = dsl_bookmark_lookup(dp, source, newbm_ds, &source_phys); switch (error) { case 0: break; /* happy path */ case EXDEV: error = SET_ERROR(ZFS_ERR_BOOKMARK_SOURCE_NOT_ANCESTOR); break; default: /* dsl_bookmark_lookup already did SET_ERROR */ break; } } else { /* * dsl_bookmark_create_nvl_validate validates that source is * either snapshot or bookmark */ panic("unreachable code: %s", source); } eholdnewbmds: dsl_dataset_rele(newbm_ds, FTAG); return (error); } int dsl_bookmark_create_check(void *arg, dmu_tx_t *tx) { dsl_bookmark_create_arg_t *dbca = arg; int rv = 0; int schema_err = 0; ASSERT3P(dbca, !=, NULL); ASSERT3P(dbca->dbca_bmarks, !=, NULL); /* dbca->dbca_errors is allowed to be NULL */ dsl_pool_t *dp = dmu_tx_pool(tx); if (!spa_feature_is_enabled(dp->dp_spa, SPA_FEATURE_BOOKMARKS)) return (SET_ERROR(ENOTSUP)); if (dsl_bookmark_create_nvl_validate(dbca->dbca_bmarks) != 0) rv = schema_err = SET_ERROR(EINVAL); for (nvpair_t *pair = nvlist_next_nvpair(dbca->dbca_bmarks, NULL); pair != NULL; pair = nvlist_next_nvpair(dbca->dbca_bmarks, pair)) { char *new = nvpair_name(pair); int error = schema_err; if (error == 0) { char *source = fnvpair_value_string(pair); error = dsl_bookmark_create_check_impl(dp, new, source); if (error != 0) error = SET_ERROR(error); } if (error != 0) { rv = error; if (dbca->dbca_errors != NULL) fnvlist_add_int32(dbca->dbca_errors, new, error); } } return (rv); } static dsl_bookmark_node_t * dsl_bookmark_node_alloc(char *shortname) { dsl_bookmark_node_t *dbn = kmem_alloc(sizeof (*dbn), KM_SLEEP); dbn->dbn_name = spa_strdup(shortname); dbn->dbn_dirty = B_FALSE; mutex_init(&dbn->dbn_lock, NULL, MUTEX_DEFAULT, NULL); return (dbn); } /* * Set the fields in the zfs_bookmark_phys_t based on the specified snapshot. */ static void dsl_bookmark_set_phys(zfs_bookmark_phys_t *zbm, dsl_dataset_t *snap) { spa_t *spa = dsl_dataset_get_spa(snap); objset_t *mos = spa_get_dsl(spa)->dp_meta_objset; dsl_dataset_phys_t *dsp = dsl_dataset_phys(snap); memset(zbm, 0, sizeof (zfs_bookmark_phys_t)); zbm->zbm_guid = dsp->ds_guid; zbm->zbm_creation_txg = dsp->ds_creation_txg; zbm->zbm_creation_time = dsp->ds_creation_time; zbm->zbm_redaction_obj = 0; /* * If the dataset is encrypted create a larger bookmark to * accommodate the IVset guid. The IVset guid was added * after the encryption feature to prevent a problem with * raw sends. If we encounter an encrypted dataset without * an IVset guid we fall back to a normal bookmark. */ if (snap->ds_dir->dd_crypto_obj != 0 && spa_feature_is_enabled(spa, SPA_FEATURE_BOOKMARK_V2)) { (void) zap_lookup(mos, snap->ds_object, DS_FIELD_IVSET_GUID, sizeof (uint64_t), 1, &zbm->zbm_ivset_guid); } if (spa_feature_is_enabled(spa, SPA_FEATURE_BOOKMARK_WRITTEN)) { zbm->zbm_flags = ZBM_FLAG_SNAPSHOT_EXISTS | ZBM_FLAG_HAS_FBN; zbm->zbm_referenced_bytes_refd = dsp->ds_referenced_bytes; zbm->zbm_compressed_bytes_refd = dsp->ds_compressed_bytes; zbm->zbm_uncompressed_bytes_refd = dsp->ds_uncompressed_bytes; dsl_dataset_t *nextds; VERIFY0(dsl_dataset_hold_obj(snap->ds_dir->dd_pool, dsp->ds_next_snap_obj, FTAG, &nextds)); dsl_deadlist_space(&nextds->ds_deadlist, &zbm->zbm_referenced_freed_before_next_snap, &zbm->zbm_compressed_freed_before_next_snap, &zbm->zbm_uncompressed_freed_before_next_snap); dsl_dataset_rele(nextds, FTAG); } } /* * Add dsl_bookmark_node_t `dbn` to the given dataset and increment appropriate * SPA feature counters. */ void dsl_bookmark_node_add(dsl_dataset_t *hds, dsl_bookmark_node_t *dbn, dmu_tx_t *tx) { dsl_pool_t *dp = dmu_tx_pool(tx); objset_t *mos = dp->dp_meta_objset; if (hds->ds_bookmarks_obj == 0) { hds->ds_bookmarks_obj = zap_create_norm(mos, U8_TEXTPREP_TOUPPER, DMU_OTN_ZAP_METADATA, DMU_OT_NONE, 0, tx); spa_feature_incr(dp->dp_spa, SPA_FEATURE_BOOKMARKS, tx); dsl_dataset_zapify(hds, tx); VERIFY0(zap_add(mos, hds->ds_object, DS_FIELD_BOOKMARK_NAMES, sizeof (hds->ds_bookmarks_obj), 1, &hds->ds_bookmarks_obj, tx)); } avl_add(&hds->ds_bookmarks, dbn); /* * To maintain backwards compatibility with software that doesn't * understand SPA_FEATURE_BOOKMARK_V2, we need to use the smallest * possible bookmark size. */ uint64_t bookmark_phys_size = BOOKMARK_PHYS_SIZE_V1; if (spa_feature_is_enabled(dp->dp_spa, SPA_FEATURE_BOOKMARK_V2) && (dbn->dbn_phys.zbm_ivset_guid != 0 || dbn->dbn_phys.zbm_flags & ZBM_FLAG_HAS_FBN || dbn->dbn_phys.zbm_redaction_obj != 0)) { bookmark_phys_size = BOOKMARK_PHYS_SIZE_V2; spa_feature_incr(dp->dp_spa, SPA_FEATURE_BOOKMARK_V2, tx); } zfs_bookmark_phys_t zero_phys = { 0 }; ASSERT0(memcmp(((char *)&dbn->dbn_phys) + bookmark_phys_size, &zero_phys, sizeof (zfs_bookmark_phys_t) - bookmark_phys_size)); VERIFY0(zap_add(mos, hds->ds_bookmarks_obj, dbn->dbn_name, sizeof (uint64_t), bookmark_phys_size / sizeof (uint64_t), &dbn->dbn_phys, tx)); } /* * If redaction_list is non-null, we create a redacted bookmark and redaction * list, and store the object number of the redaction list in redact_obj. */ static void dsl_bookmark_create_sync_impl_snap(const char *bookmark, const char *snapshot, dmu_tx_t *tx, uint64_t num_redact_snaps, uint64_t *redact_snaps, const void *tag, redaction_list_t **redaction_list) { dsl_pool_t *dp = dmu_tx_pool(tx); objset_t *mos = dp->dp_meta_objset; dsl_dataset_t *snapds, *bmark_fs; char *shortname; boolean_t bookmark_redacted; uint64_t *dsredactsnaps; uint64_t dsnumsnaps; VERIFY0(dsl_dataset_hold(dp, snapshot, FTAG, &snapds)); VERIFY0(dsl_bookmark_hold_ds(dp, bookmark, &bmark_fs, FTAG, &shortname)); dsl_bookmark_node_t *dbn = dsl_bookmark_node_alloc(shortname); dsl_bookmark_set_phys(&dbn->dbn_phys, snapds); bookmark_redacted = dsl_dataset_get_uint64_array_feature(snapds, SPA_FEATURE_REDACTED_DATASETS, &dsnumsnaps, &dsredactsnaps); if (redaction_list != NULL || bookmark_redacted) { redaction_list_t *local_rl; if (bookmark_redacted) { redact_snaps = dsredactsnaps; num_redact_snaps = dsnumsnaps; } dbn->dbn_phys.zbm_redaction_obj = dmu_object_alloc(mos, DMU_OTN_UINT64_METADATA, SPA_OLD_MAXBLOCKSIZE, DMU_OTN_UINT64_METADATA, sizeof (redaction_list_phys_t) + num_redact_snaps * sizeof (uint64_t), tx); spa_feature_incr(dp->dp_spa, SPA_FEATURE_REDACTION_BOOKMARKS, tx); VERIFY0(dsl_redaction_list_hold_obj(dp, dbn->dbn_phys.zbm_redaction_obj, tag, &local_rl)); dsl_redaction_list_long_hold(dp, local_rl, tag); ASSERT3U((local_rl)->rl_dbuf->db_size, >=, sizeof (redaction_list_phys_t) + num_redact_snaps * sizeof (uint64_t)); dmu_buf_will_dirty(local_rl->rl_dbuf, tx); memcpy(local_rl->rl_phys->rlp_snaps, redact_snaps, sizeof (uint64_t) * num_redact_snaps); local_rl->rl_phys->rlp_num_snaps = num_redact_snaps; if (bookmark_redacted) { ASSERT3P(redaction_list, ==, NULL); local_rl->rl_phys->rlp_last_blkid = UINT64_MAX; local_rl->rl_phys->rlp_last_object = UINT64_MAX; dsl_redaction_list_long_rele(local_rl, tag); dsl_redaction_list_rele(local_rl, tag); } else { *redaction_list = local_rl; } } if (dbn->dbn_phys.zbm_flags & ZBM_FLAG_HAS_FBN) { spa_feature_incr(dp->dp_spa, SPA_FEATURE_BOOKMARK_WRITTEN, tx); } dsl_bookmark_node_add(bmark_fs, dbn, tx); spa_history_log_internal_ds(bmark_fs, "bookmark", tx, "name=%s creation_txg=%llu target_snap=%llu redact_obj=%llu", shortname, (longlong_t)dbn->dbn_phys.zbm_creation_txg, (longlong_t)snapds->ds_object, (longlong_t)dbn->dbn_phys.zbm_redaction_obj); dsl_dataset_rele(bmark_fs, FTAG); dsl_dataset_rele(snapds, FTAG); } static void dsl_bookmark_create_sync_impl_book( const char *new_name, const char *source_name, dmu_tx_t *tx) { dsl_pool_t *dp = dmu_tx_pool(tx); dsl_dataset_t *bmark_fs_source, *bmark_fs_new; char *source_shortname, *new_shortname; zfs_bookmark_phys_t source_phys; VERIFY0(dsl_bookmark_hold_ds(dp, source_name, &bmark_fs_source, FTAG, &source_shortname)); VERIFY0(dsl_bookmark_hold_ds(dp, new_name, &bmark_fs_new, FTAG, &new_shortname)); /* * create a copy of the source bookmark by copying most of its members * * Caveat: bookmarking a redaction bookmark yields a normal bookmark * ----------------------------------------------------------------- * Reasoning: * - The zbm_redaction_obj would be referred to by both source and new * bookmark, but would be destroyed once either source or new is * destroyed, resulting in use-after-free of the referred object. * - User expectation when issuing the `zfs bookmark` command is that * a normal bookmark of the source is created * * Design Alternatives For Full Redaction Bookmark Copying: * - reference-count the redaction object => would require on-disk * format change for existing redaction objects * - Copy the redaction object => cannot be done in syncing context * because the redaction object might be too large */ VERIFY0(dsl_bookmark_lookup_impl(bmark_fs_source, source_shortname, &source_phys)); dsl_bookmark_node_t *new_dbn = dsl_bookmark_node_alloc(new_shortname); memcpy(&new_dbn->dbn_phys, &source_phys, sizeof (source_phys)); new_dbn->dbn_phys.zbm_redaction_obj = 0; /* update feature counters */ if (new_dbn->dbn_phys.zbm_flags & ZBM_FLAG_HAS_FBN) { spa_feature_incr(dp->dp_spa, SPA_FEATURE_BOOKMARK_WRITTEN, tx); } /* no need for redaction bookmark counter; nulled zbm_redaction_obj */ /* dsl_bookmark_node_add bumps bookmarks and v2-bookmarks counter */ /* * write new bookmark * * Note that dsl_bookmark_lookup_impl guarantees that, if source is a * v1 bookmark, the v2-only fields are zeroed. * And dsl_bookmark_node_add writes back a v1-sized bookmark if * v2 bookmarks are disabled and/or v2-only fields are zeroed. * => bookmark copying works on pre-bookmark-v2 pools */ dsl_bookmark_node_add(bmark_fs_new, new_dbn, tx); spa_history_log_internal_ds(bmark_fs_source, "bookmark", tx, "name=%s creation_txg=%llu source_guid=%llu", new_shortname, (longlong_t)new_dbn->dbn_phys.zbm_creation_txg, (longlong_t)source_phys.zbm_guid); dsl_dataset_rele(bmark_fs_source, FTAG); dsl_dataset_rele(bmark_fs_new, FTAG); } void dsl_bookmark_create_sync(void *arg, dmu_tx_t *tx) { dsl_bookmark_create_arg_t *dbca = arg; ASSERT(spa_feature_is_enabled(dmu_tx_pool(tx)->dp_spa, SPA_FEATURE_BOOKMARKS)); for (nvpair_t *pair = nvlist_next_nvpair(dbca->dbca_bmarks, NULL); pair != NULL; pair = nvlist_next_nvpair(dbca->dbca_bmarks, pair)) { char *new = nvpair_name(pair); char *source = fnvpair_value_string(pair); if (strchr(source, '@') != NULL) { dsl_bookmark_create_sync_impl_snap(new, source, tx, 0, NULL, NULL, NULL); } else if (strchr(source, '#') != NULL) { dsl_bookmark_create_sync_impl_book(new, source, tx); } else { panic("unreachable code"); } } } /* * The bookmarks must all be in the same pool. */ int dsl_bookmark_create(nvlist_t *bmarks, nvlist_t *errors) { nvpair_t *pair; dsl_bookmark_create_arg_t dbca; pair = nvlist_next_nvpair(bmarks, NULL); if (pair == NULL) return (0); dbca.dbca_bmarks = bmarks; dbca.dbca_errors = errors; return (dsl_sync_task(nvpair_name(pair), dsl_bookmark_create_check, dsl_bookmark_create_sync, &dbca, fnvlist_num_pairs(bmarks), ZFS_SPACE_CHECK_NORMAL)); } static int dsl_bookmark_create_redacted_check(void *arg, dmu_tx_t *tx) { dsl_bookmark_create_redacted_arg_t *dbcra = arg; dsl_pool_t *dp = dmu_tx_pool(tx); int rv = 0; if (!spa_feature_is_enabled(dp->dp_spa, SPA_FEATURE_REDACTION_BOOKMARKS)) return (SET_ERROR(ENOTSUP)); /* * If the list of redact snaps will not fit in the bonus buffer with * the furthest reached object and offset, fail. */ if (dbcra->dbcra_numsnaps > (dmu_bonus_max() - sizeof (redaction_list_phys_t)) / sizeof (uint64_t)) return (SET_ERROR(E2BIG)); if (dsl_bookmark_create_nvl_validate_pair( dbcra->dbcra_bmark, dbcra->dbcra_snap) != 0) return (SET_ERROR(EINVAL)); rv = dsl_bookmark_create_check_impl(dp, dbcra->dbcra_bmark, dbcra->dbcra_snap); return (rv); } static void dsl_bookmark_create_redacted_sync(void *arg, dmu_tx_t *tx) { dsl_bookmark_create_redacted_arg_t *dbcra = arg; dsl_bookmark_create_sync_impl_snap(dbcra->dbcra_bmark, dbcra->dbcra_snap, tx, dbcra->dbcra_numsnaps, dbcra->dbcra_snaps, dbcra->dbcra_tag, dbcra->dbcra_rl); } int dsl_bookmark_create_redacted(const char *bookmark, const char *snapshot, uint64_t numsnaps, uint64_t *snapguids, const void *tag, redaction_list_t **rl) { dsl_bookmark_create_redacted_arg_t dbcra; dbcra.dbcra_bmark = bookmark; dbcra.dbcra_snap = snapshot; dbcra.dbcra_rl = rl; dbcra.dbcra_numsnaps = numsnaps; dbcra.dbcra_snaps = snapguids; dbcra.dbcra_tag = tag; return (dsl_sync_task(bookmark, dsl_bookmark_create_redacted_check, dsl_bookmark_create_redacted_sync, &dbcra, 5, ZFS_SPACE_CHECK_NORMAL)); } /* * Retrieve the list of properties given in the 'props' nvlist for a bookmark. * If 'props' is NULL, retrieves all properties. */ static void dsl_bookmark_fetch_props(dsl_pool_t *dp, zfs_bookmark_phys_t *bmark_phys, nvlist_t *props, nvlist_t *out_props) { ASSERT3P(dp, !=, NULL); ASSERT3P(bmark_phys, !=, NULL); ASSERT3P(out_props, !=, NULL); ASSERT(RRW_LOCK_HELD(&dp->dp_config_rwlock)); if (props == NULL || nvlist_exists(props, zfs_prop_to_name(ZFS_PROP_GUID))) { dsl_prop_nvlist_add_uint64(out_props, ZFS_PROP_GUID, bmark_phys->zbm_guid); } if (props == NULL || nvlist_exists(props, zfs_prop_to_name(ZFS_PROP_CREATETXG))) { dsl_prop_nvlist_add_uint64(out_props, ZFS_PROP_CREATETXG, bmark_phys->zbm_creation_txg); } if (props == NULL || nvlist_exists(props, zfs_prop_to_name(ZFS_PROP_CREATION))) { dsl_prop_nvlist_add_uint64(out_props, ZFS_PROP_CREATION, bmark_phys->zbm_creation_time); } if (props == NULL || nvlist_exists(props, zfs_prop_to_name(ZFS_PROP_IVSET_GUID))) { dsl_prop_nvlist_add_uint64(out_props, ZFS_PROP_IVSET_GUID, bmark_phys->zbm_ivset_guid); } if (bmark_phys->zbm_flags & ZBM_FLAG_HAS_FBN) { if (props == NULL || nvlist_exists(props, zfs_prop_to_name(ZFS_PROP_REFERENCED))) { dsl_prop_nvlist_add_uint64(out_props, ZFS_PROP_REFERENCED, bmark_phys->zbm_referenced_bytes_refd); } if (props == NULL || nvlist_exists(props, zfs_prop_to_name(ZFS_PROP_LOGICALREFERENCED))) { dsl_prop_nvlist_add_uint64(out_props, ZFS_PROP_LOGICALREFERENCED, bmark_phys->zbm_uncompressed_bytes_refd); } if (props == NULL || nvlist_exists(props, zfs_prop_to_name(ZFS_PROP_REFRATIO))) { uint64_t ratio = bmark_phys->zbm_compressed_bytes_refd == 0 ? 100 : bmark_phys->zbm_uncompressed_bytes_refd * 100 / bmark_phys->zbm_compressed_bytes_refd; dsl_prop_nvlist_add_uint64(out_props, ZFS_PROP_REFRATIO, ratio); } } if ((props == NULL || nvlist_exists(props, "redact_snaps") || nvlist_exists(props, "redact_complete")) && bmark_phys->zbm_redaction_obj != 0) { redaction_list_t *rl; int err = dsl_redaction_list_hold_obj(dp, bmark_phys->zbm_redaction_obj, FTAG, &rl); if (err == 0) { if (nvlist_exists(props, "redact_snaps")) { nvlist_t *nvl; nvl = fnvlist_alloc(); fnvlist_add_uint64_array(nvl, ZPROP_VALUE, rl->rl_phys->rlp_snaps, rl->rl_phys->rlp_num_snaps); fnvlist_add_nvlist(out_props, "redact_snaps", nvl); nvlist_free(nvl); } if (nvlist_exists(props, "redact_complete")) { nvlist_t *nvl; nvl = fnvlist_alloc(); fnvlist_add_boolean_value(nvl, ZPROP_VALUE, rl->rl_phys->rlp_last_blkid == UINT64_MAX && rl->rl_phys->rlp_last_object == UINT64_MAX); fnvlist_add_nvlist(out_props, "redact_complete", nvl); nvlist_free(nvl); } dsl_redaction_list_rele(rl, FTAG); } } } int dsl_get_bookmarks_impl(dsl_dataset_t *ds, nvlist_t *props, nvlist_t *outnvl) { dsl_pool_t *dp = ds->ds_dir->dd_pool; ASSERT(dsl_pool_config_held(dp)); if (dsl_dataset_is_snapshot(ds)) return (SET_ERROR(EINVAL)); for (dsl_bookmark_node_t *dbn = avl_first(&ds->ds_bookmarks); dbn != NULL; dbn = AVL_NEXT(&ds->ds_bookmarks, dbn)) { nvlist_t *out_props = fnvlist_alloc(); dsl_bookmark_fetch_props(dp, &dbn->dbn_phys, props, out_props); fnvlist_add_nvlist(outnvl, dbn->dbn_name, out_props); fnvlist_free(out_props); } return (0); } /* * Comparison func for ds_bookmarks AVL tree. We sort the bookmarks by * their TXG, then by their FBN-ness. The "FBN-ness" component ensures * that all bookmarks at the same TXG that HAS_FBN are adjacent, which * dsl_bookmark_destroy_sync_impl() depends on. Note that there may be * multiple bookmarks at the same TXG (with the same FBN-ness). In this * case we differentiate them by an arbitrary metric (in this case, * their names). */ static int dsl_bookmark_compare(const void *l, const void *r) { const dsl_bookmark_node_t *ldbn = l; const dsl_bookmark_node_t *rdbn = r; int64_t cmp = TREE_CMP(ldbn->dbn_phys.zbm_creation_txg, rdbn->dbn_phys.zbm_creation_txg); if (likely(cmp)) return (cmp); cmp = TREE_CMP((ldbn->dbn_phys.zbm_flags & ZBM_FLAG_HAS_FBN), (rdbn->dbn_phys.zbm_flags & ZBM_FLAG_HAS_FBN)); if (likely(cmp)) return (cmp); cmp = strcmp(ldbn->dbn_name, rdbn->dbn_name); return (TREE_ISIGN(cmp)); } /* * Cache this (head) dataset's bookmarks in the ds_bookmarks AVL tree. */ int dsl_bookmark_init_ds(dsl_dataset_t *ds) { dsl_pool_t *dp = ds->ds_dir->dd_pool; objset_t *mos = dp->dp_meta_objset; ASSERT(!ds->ds_is_snapshot); avl_create(&ds->ds_bookmarks, dsl_bookmark_compare, sizeof (dsl_bookmark_node_t), offsetof(dsl_bookmark_node_t, dbn_node)); if (!dsl_dataset_is_zapified(ds)) return (0); int zaperr = zap_lookup(mos, ds->ds_object, DS_FIELD_BOOKMARK_NAMES, sizeof (ds->ds_bookmarks_obj), 1, &ds->ds_bookmarks_obj); if (zaperr == ENOENT) return (0); if (zaperr != 0) return (zaperr); if (ds->ds_bookmarks_obj == 0) return (0); int err = 0; zap_cursor_t zc; zap_attribute_t attr; for (zap_cursor_init(&zc, mos, ds->ds_bookmarks_obj); (err = zap_cursor_retrieve(&zc, &attr)) == 0; zap_cursor_advance(&zc)) { dsl_bookmark_node_t *dbn = dsl_bookmark_node_alloc(attr.za_name); err = dsl_bookmark_lookup_impl(ds, dbn->dbn_name, &dbn->dbn_phys); ASSERT3U(err, !=, ENOENT); if (err != 0) { kmem_free(dbn, sizeof (*dbn)); break; } avl_add(&ds->ds_bookmarks, dbn); } zap_cursor_fini(&zc); if (err == ENOENT) err = 0; return (err); } void dsl_bookmark_fini_ds(dsl_dataset_t *ds) { void *cookie = NULL; dsl_bookmark_node_t *dbn; if (ds->ds_is_snapshot) return; while ((dbn = avl_destroy_nodes(&ds->ds_bookmarks, &cookie)) != NULL) { spa_strfree(dbn->dbn_name); mutex_destroy(&dbn->dbn_lock); kmem_free(dbn, sizeof (*dbn)); } avl_destroy(&ds->ds_bookmarks); } /* * Retrieve the bookmarks that exist in the specified dataset, and the * requested properties of each bookmark. * * The "props" nvlist specifies which properties are requested. * See lzc_get_bookmarks() for the list of valid properties. */ int dsl_get_bookmarks(const char *dsname, nvlist_t *props, nvlist_t *outnvl) { dsl_pool_t *dp; dsl_dataset_t *ds; int err; err = dsl_pool_hold(dsname, FTAG, &dp); if (err != 0) return (err); err = dsl_dataset_hold(dp, dsname, FTAG, &ds); if (err != 0) { dsl_pool_rele(dp, FTAG); return (err); } err = dsl_get_bookmarks_impl(ds, props, outnvl); dsl_dataset_rele(ds, FTAG); dsl_pool_rele(dp, FTAG); return (err); } /* * Retrieve all properties for a single bookmark in the given dataset. */ int dsl_get_bookmark_props(const char *dsname, const char *bmname, nvlist_t *props) { dsl_pool_t *dp; dsl_dataset_t *ds; zfs_bookmark_phys_t bmark_phys = { 0 }; int err; err = dsl_pool_hold(dsname, FTAG, &dp); if (err != 0) return (err); err = dsl_dataset_hold(dp, dsname, FTAG, &ds); if (err != 0) { dsl_pool_rele(dp, FTAG); return (err); } err = dsl_bookmark_lookup_impl(ds, bmname, &bmark_phys); if (err != 0) goto out; dsl_bookmark_fetch_props(dp, &bmark_phys, NULL, props); out: dsl_dataset_rele(ds, FTAG); dsl_pool_rele(dp, FTAG); return (err); } typedef struct dsl_bookmark_destroy_arg { nvlist_t *dbda_bmarks; nvlist_t *dbda_success; nvlist_t *dbda_errors; } dsl_bookmark_destroy_arg_t; static void dsl_bookmark_destroy_sync_impl(dsl_dataset_t *ds, const char *name, dmu_tx_t *tx) { objset_t *mos = ds->ds_dir->dd_pool->dp_meta_objset; uint64_t bmark_zapobj = ds->ds_bookmarks_obj; matchtype_t mt = 0; uint64_t int_size, num_ints; /* * 'search' must be zeroed so that dbn_flags (which is used in * dsl_bookmark_compare()) will be zeroed even if the on-disk * (in ZAP) bookmark is shorter than offsetof(dbn_flags). */ dsl_bookmark_node_t search = { 0 }; char realname[ZFS_MAX_DATASET_NAME_LEN]; /* * Find the real name of this bookmark, which may be different * from the given name if the dataset is case-insensitive. Then * use the real name to find the node in the ds_bookmarks AVL tree. */ if (dsl_dataset_phys(ds)->ds_flags & DS_FLAG_CI_DATASET) mt = MT_NORMALIZE; VERIFY0(zap_length(mos, bmark_zapobj, name, &int_size, &num_ints)); ASSERT3U(int_size, ==, sizeof (uint64_t)); if (num_ints * int_size > BOOKMARK_PHYS_SIZE_V1) { spa_feature_decr(dmu_objset_spa(mos), SPA_FEATURE_BOOKMARK_V2, tx); } VERIFY0(zap_lookup_norm(mos, bmark_zapobj, name, sizeof (uint64_t), num_ints, &search.dbn_phys, mt, realname, sizeof (realname), NULL)); search.dbn_name = realname; dsl_bookmark_node_t *dbn = avl_find(&ds->ds_bookmarks, &search, NULL); ASSERT(dbn != NULL); if (dbn->dbn_phys.zbm_flags & ZBM_FLAG_HAS_FBN) { /* * If this bookmark HAS_FBN, and it is before the most * recent snapshot, then its TXG is a key in the head's * deadlist (and all clones' heads' deadlists). If this is * the last thing keeping the key (i.e. there are no more * bookmarks with HAS_FBN at this TXG, and there is no * snapshot at this TXG), then remove the key. * * Note that this algorithm depends on ds_bookmarks being * sorted such that all bookmarks at the same TXG with * HAS_FBN are adjacent (with no non-HAS_FBN bookmarks * at the same TXG in between them). If this were not * the case, we would need to examine *all* bookmarks * at this TXG, rather than just the adjacent ones. */ dsl_bookmark_node_t *dbn_prev = AVL_PREV(&ds->ds_bookmarks, dbn); dsl_bookmark_node_t *dbn_next = AVL_NEXT(&ds->ds_bookmarks, dbn); boolean_t more_bookmarks_at_this_txg = (dbn_prev != NULL && dbn_prev->dbn_phys.zbm_creation_txg == dbn->dbn_phys.zbm_creation_txg && (dbn_prev->dbn_phys.zbm_flags & ZBM_FLAG_HAS_FBN)) || (dbn_next != NULL && dbn_next->dbn_phys.zbm_creation_txg == dbn->dbn_phys.zbm_creation_txg && (dbn_next->dbn_phys.zbm_flags & ZBM_FLAG_HAS_FBN)); if (!(dbn->dbn_phys.zbm_flags & ZBM_FLAG_SNAPSHOT_EXISTS) && !more_bookmarks_at_this_txg && dbn->dbn_phys.zbm_creation_txg < dsl_dataset_phys(ds)->ds_prev_snap_txg) { dsl_dir_remove_clones_key(ds->ds_dir, dbn->dbn_phys.zbm_creation_txg, tx); dsl_deadlist_remove_key(&ds->ds_deadlist, dbn->dbn_phys.zbm_creation_txg, tx); } spa_feature_decr(dmu_objset_spa(mos), SPA_FEATURE_BOOKMARK_WRITTEN, tx); } if (dbn->dbn_phys.zbm_redaction_obj != 0) { VERIFY0(dmu_object_free(mos, dbn->dbn_phys.zbm_redaction_obj, tx)); spa_feature_decr(dmu_objset_spa(mos), SPA_FEATURE_REDACTION_BOOKMARKS, tx); } avl_remove(&ds->ds_bookmarks, dbn); spa_strfree(dbn->dbn_name); mutex_destroy(&dbn->dbn_lock); kmem_free(dbn, sizeof (*dbn)); VERIFY0(zap_remove_norm(mos, bmark_zapobj, name, mt, tx)); } static int dsl_bookmark_destroy_check(void *arg, dmu_tx_t *tx) { dsl_bookmark_destroy_arg_t *dbda = arg; dsl_pool_t *dp = dmu_tx_pool(tx); int rv = 0; ASSERT(nvlist_empty(dbda->dbda_success)); ASSERT(nvlist_empty(dbda->dbda_errors)); if (!spa_feature_is_enabled(dp->dp_spa, SPA_FEATURE_BOOKMARKS)) return (0); for (nvpair_t *pair = nvlist_next_nvpair(dbda->dbda_bmarks, NULL); pair != NULL; pair = nvlist_next_nvpair(dbda->dbda_bmarks, pair)) { const char *fullname = nvpair_name(pair); dsl_dataset_t *ds; zfs_bookmark_phys_t bm; int error; char *shortname; error = dsl_bookmark_hold_ds(dp, fullname, &ds, FTAG, &shortname); if (error == ENOENT) { /* ignore it; the bookmark is "already destroyed" */ continue; } if (error == 0) { error = dsl_bookmark_lookup_impl(ds, shortname, &bm); dsl_dataset_rele(ds, FTAG); if (error == ESRCH) { /* * ignore it; the bookmark is * "already destroyed" */ continue; } if (error == 0 && bm.zbm_redaction_obj != 0) { redaction_list_t *rl = NULL; error = dsl_redaction_list_hold_obj(tx->tx_pool, bm.zbm_redaction_obj, FTAG, &rl); if (error == ENOENT) { error = 0; } else if (error == 0 && dsl_redaction_list_long_held(rl)) { error = SET_ERROR(EBUSY); } if (rl != NULL) { dsl_redaction_list_rele(rl, FTAG); } } } if (error == 0) { if (dmu_tx_is_syncing(tx)) { fnvlist_add_boolean(dbda->dbda_success, fullname); } } else { fnvlist_add_int32(dbda->dbda_errors, fullname, error); rv = error; } } return (rv); } static void dsl_bookmark_destroy_sync(void *arg, dmu_tx_t *tx) { dsl_bookmark_destroy_arg_t *dbda = arg; dsl_pool_t *dp = dmu_tx_pool(tx); objset_t *mos = dp->dp_meta_objset; for (nvpair_t *pair = nvlist_next_nvpair(dbda->dbda_success, NULL); pair != NULL; pair = nvlist_next_nvpair(dbda->dbda_success, pair)) { dsl_dataset_t *ds; char *shortname; uint64_t zap_cnt; VERIFY0(dsl_bookmark_hold_ds(dp, nvpair_name(pair), &ds, FTAG, &shortname)); dsl_bookmark_destroy_sync_impl(ds, shortname, tx); /* * If all of this dataset's bookmarks have been destroyed, * free the zap object and decrement the feature's use count. */ VERIFY0(zap_count(mos, ds->ds_bookmarks_obj, &zap_cnt)); if (zap_cnt == 0) { dmu_buf_will_dirty(ds->ds_dbuf, tx); VERIFY0(zap_destroy(mos, ds->ds_bookmarks_obj, tx)); ds->ds_bookmarks_obj = 0; spa_feature_decr(dp->dp_spa, SPA_FEATURE_BOOKMARKS, tx); VERIFY0(zap_remove(mos, ds->ds_object, DS_FIELD_BOOKMARK_NAMES, tx)); } spa_history_log_internal_ds(ds, "remove bookmark", tx, "name=%s", shortname); dsl_dataset_rele(ds, FTAG); } } /* * The bookmarks must all be in the same pool. */ int dsl_bookmark_destroy(nvlist_t *bmarks, nvlist_t *errors) { int rv; dsl_bookmark_destroy_arg_t dbda; nvpair_t *pair = nvlist_next_nvpair(bmarks, NULL); if (pair == NULL) return (0); dbda.dbda_bmarks = bmarks; dbda.dbda_errors = errors; dbda.dbda_success = fnvlist_alloc(); rv = dsl_sync_task(nvpair_name(pair), dsl_bookmark_destroy_check, dsl_bookmark_destroy_sync, &dbda, fnvlist_num_pairs(bmarks), ZFS_SPACE_CHECK_RESERVED); fnvlist_free(dbda.dbda_success); return (rv); } /* Return B_TRUE if there are any long holds on this dataset. */ boolean_t dsl_redaction_list_long_held(redaction_list_t *rl) { return (!zfs_refcount_is_zero(&rl->rl_longholds)); } void dsl_redaction_list_long_hold(dsl_pool_t *dp, redaction_list_t *rl, const void *tag) { ASSERT(dsl_pool_config_held(dp)); (void) zfs_refcount_add(&rl->rl_longholds, tag); } void dsl_redaction_list_long_rele(redaction_list_t *rl, const void *tag) { (void) zfs_refcount_remove(&rl->rl_longholds, tag); } static void redaction_list_evict_sync(void *rlu) { redaction_list_t *rl = rlu; zfs_refcount_destroy(&rl->rl_longholds); kmem_free(rl, sizeof (redaction_list_t)); } void dsl_redaction_list_rele(redaction_list_t *rl, const void *tag) { dmu_buf_rele(rl->rl_dbuf, tag); } int dsl_redaction_list_hold_obj(dsl_pool_t *dp, uint64_t rlobj, const void *tag, redaction_list_t **rlp) { objset_t *mos = dp->dp_meta_objset; dmu_buf_t *dbuf; redaction_list_t *rl; int err; ASSERT(dsl_pool_config_held(dp)); err = dmu_bonus_hold(mos, rlobj, tag, &dbuf); if (err != 0) return (err); rl = dmu_buf_get_user(dbuf); if (rl == NULL) { redaction_list_t *winner = NULL; rl = kmem_zalloc(sizeof (redaction_list_t), KM_SLEEP); rl->rl_dbuf = dbuf; rl->rl_object = rlobj; rl->rl_phys = dbuf->db_data; rl->rl_mos = dp->dp_meta_objset; zfs_refcount_create(&rl->rl_longholds); dmu_buf_init_user(&rl->rl_dbu, redaction_list_evict_sync, NULL, &rl->rl_dbuf); if ((winner = dmu_buf_set_user_ie(dbuf, &rl->rl_dbu)) != NULL) { kmem_free(rl, sizeof (*rl)); rl = winner; } } *rlp = rl; return (0); } /* * Snapshot ds is being destroyed. * * Adjust the "freed_before_next" of any bookmarks between this snap * and the previous snapshot, because their "next snapshot" is changing. * * If there are any bookmarks with HAS_FBN at this snapshot, remove * their HAS_SNAP flag (note: there can be at most one snapshot of * each filesystem at a given txg), and return B_TRUE. In this case * the caller can not remove the key in the deadlist at this TXG, because * the HAS_FBN bookmarks require the key be there. * * Returns B_FALSE if there are no bookmarks with HAS_FBN at this * snapshot's TXG. In this case the caller can remove the key in the * deadlist at this TXG. */ boolean_t dsl_bookmark_ds_destroyed(dsl_dataset_t *ds, dmu_tx_t *tx) { dsl_pool_t *dp = ds->ds_dir->dd_pool; dsl_dataset_t *head, *next; VERIFY0(dsl_dataset_hold_obj(dp, dsl_dir_phys(ds->ds_dir)->dd_head_dataset_obj, FTAG, &head)); VERIFY0(dsl_dataset_hold_obj(dp, dsl_dataset_phys(ds)->ds_next_snap_obj, FTAG, &next)); /* * Find the first bookmark that HAS_FBN at or after the * previous snapshot. */ dsl_bookmark_node_t search = { 0 }; avl_index_t idx; search.dbn_phys.zbm_creation_txg = dsl_dataset_phys(ds)->ds_prev_snap_txg; search.dbn_phys.zbm_flags = ZBM_FLAG_HAS_FBN; /* * The empty-string name can't be in the AVL, and it compares * before any entries with this TXG. */ search.dbn_name = (char *)""; VERIFY3P(avl_find(&head->ds_bookmarks, &search, &idx), ==, NULL); dsl_bookmark_node_t *dbn = avl_nearest(&head->ds_bookmarks, idx, AVL_AFTER); /* * Iterate over all bookmarks that are at or after the previous * snapshot, and before this (being deleted) snapshot. Adjust * their FBN based on their new next snapshot. */ for (; dbn != NULL && dbn->dbn_phys.zbm_creation_txg < dsl_dataset_phys(ds)->ds_creation_txg; dbn = AVL_NEXT(&head->ds_bookmarks, dbn)) { if (!(dbn->dbn_phys.zbm_flags & ZBM_FLAG_HAS_FBN)) continue; /* * Increase our FBN by the amount of space that was live * (referenced) at the time of this bookmark (i.e. * birth <= zbm_creation_txg), and killed between this * (being deleted) snapshot and the next snapshot (i.e. * on the next snapshot's deadlist). (Space killed before * this are already on our FBN.) */ uint64_t referenced, compressed, uncompressed; dsl_deadlist_space_range(&next->ds_deadlist, 0, dbn->dbn_phys.zbm_creation_txg, &referenced, &compressed, &uncompressed); dbn->dbn_phys.zbm_referenced_freed_before_next_snap += referenced; dbn->dbn_phys.zbm_compressed_freed_before_next_snap += compressed; dbn->dbn_phys.zbm_uncompressed_freed_before_next_snap += uncompressed; VERIFY0(zap_update(dp->dp_meta_objset, head->ds_bookmarks_obj, dbn->dbn_name, sizeof (uint64_t), sizeof (zfs_bookmark_phys_t) / sizeof (uint64_t), &dbn->dbn_phys, tx)); } dsl_dataset_rele(next, FTAG); /* * There may be several bookmarks at this txg (the TXG of the * snapshot being deleted). We need to clear the SNAPSHOT_EXISTS * flag on all of them, and return TRUE if there is at least 1 * bookmark here with HAS_FBN (thus preventing the deadlist * key from being removed). */ boolean_t rv = B_FALSE; for (; dbn != NULL && dbn->dbn_phys.zbm_creation_txg == dsl_dataset_phys(ds)->ds_creation_txg; dbn = AVL_NEXT(&head->ds_bookmarks, dbn)) { if (!(dbn->dbn_phys.zbm_flags & ZBM_FLAG_HAS_FBN)) { ASSERT(!(dbn->dbn_phys.zbm_flags & ZBM_FLAG_SNAPSHOT_EXISTS)); continue; } ASSERT(dbn->dbn_phys.zbm_flags & ZBM_FLAG_SNAPSHOT_EXISTS); dbn->dbn_phys.zbm_flags &= ~ZBM_FLAG_SNAPSHOT_EXISTS; VERIFY0(zap_update(dp->dp_meta_objset, head->ds_bookmarks_obj, dbn->dbn_name, sizeof (uint64_t), sizeof (zfs_bookmark_phys_t) / sizeof (uint64_t), &dbn->dbn_phys, tx)); rv = B_TRUE; } dsl_dataset_rele(head, FTAG); return (rv); } /* * A snapshot is being created of this (head) dataset. * * We don't keep keys in the deadlist for the most recent snapshot, or any * bookmarks at or after it, because there can't be any blocks on the * deadlist in this range. Now that the most recent snapshot is after * all bookmarks, we need to add these keys. Note that the caller always * adds a key at the previous snapshot, so we only add keys for bookmarks * after that. */ void dsl_bookmark_snapshotted(dsl_dataset_t *ds, dmu_tx_t *tx) { uint64_t last_key_added = UINT64_MAX; for (dsl_bookmark_node_t *dbn = avl_last(&ds->ds_bookmarks); dbn != NULL && dbn->dbn_phys.zbm_creation_txg > dsl_dataset_phys(ds)->ds_prev_snap_txg; dbn = AVL_PREV(&ds->ds_bookmarks, dbn)) { uint64_t creation_txg = dbn->dbn_phys.zbm_creation_txg; ASSERT3U(creation_txg, <=, last_key_added); /* * Note, there may be multiple bookmarks at this TXG, * and we only want to add the key for this TXG once. * The ds_bookmarks AVL is sorted by TXG, so we will visit * these bookmarks in sequence. */ if ((dbn->dbn_phys.zbm_flags & ZBM_FLAG_HAS_FBN) && creation_txg != last_key_added) { dsl_deadlist_add_key(&ds->ds_deadlist, creation_txg, tx); last_key_added = creation_txg; } } } /* * The next snapshot of the origin dataset has changed, due to * promote or clone swap. If there are any bookmarks at this dataset, * we need to update their zbm_*_freed_before_next_snap to reflect this. * The head dataset has the relevant bookmarks in ds_bookmarks. */ void dsl_bookmark_next_changed(dsl_dataset_t *head, dsl_dataset_t *origin, dmu_tx_t *tx) { dsl_pool_t *dp = dmu_tx_pool(tx); /* * Find the first bookmark that HAS_FBN at the origin snapshot. */ dsl_bookmark_node_t search = { 0 }; avl_index_t idx; search.dbn_phys.zbm_creation_txg = dsl_dataset_phys(origin)->ds_creation_txg; search.dbn_phys.zbm_flags = ZBM_FLAG_HAS_FBN; /* * The empty-string name can't be in the AVL, and it compares * before any entries with this TXG. */ search.dbn_name = (char *)""; VERIFY3P(avl_find(&head->ds_bookmarks, &search, &idx), ==, NULL); dsl_bookmark_node_t *dbn = avl_nearest(&head->ds_bookmarks, idx, AVL_AFTER); /* * Iterate over all bookmarks that are at the origin txg. * Adjust their FBN based on their new next snapshot. */ for (; dbn != NULL && dbn->dbn_phys.zbm_creation_txg == dsl_dataset_phys(origin)->ds_creation_txg && (dbn->dbn_phys.zbm_flags & ZBM_FLAG_HAS_FBN); dbn = AVL_NEXT(&head->ds_bookmarks, dbn)) { /* * Bookmark is at the origin, therefore its * "next dataset" is changing, so we need * to reset its FBN by recomputing it in * dsl_bookmark_set_phys(). */ ASSERT3U(dbn->dbn_phys.zbm_guid, ==, dsl_dataset_phys(origin)->ds_guid); ASSERT3U(dbn->dbn_phys.zbm_referenced_bytes_refd, ==, dsl_dataset_phys(origin)->ds_referenced_bytes); ASSERT(dbn->dbn_phys.zbm_flags & ZBM_FLAG_SNAPSHOT_EXISTS); /* * Save and restore the zbm_redaction_obj, which * is zeroed by dsl_bookmark_set_phys(). */ uint64_t redaction_obj = dbn->dbn_phys.zbm_redaction_obj; dsl_bookmark_set_phys(&dbn->dbn_phys, origin); dbn->dbn_phys.zbm_redaction_obj = redaction_obj; VERIFY0(zap_update(dp->dp_meta_objset, head->ds_bookmarks_obj, dbn->dbn_name, sizeof (uint64_t), sizeof (zfs_bookmark_phys_t) / sizeof (uint64_t), &dbn->dbn_phys, tx)); } } /* * This block is no longer referenced by this (head) dataset. * * Adjust the FBN of any bookmarks that reference this block, whose "next" * is the head dataset. */ void dsl_bookmark_block_killed(dsl_dataset_t *ds, const blkptr_t *bp, dmu_tx_t *tx) { (void) tx; /* * Iterate over bookmarks whose "next" is the head dataset. */ for (dsl_bookmark_node_t *dbn = avl_last(&ds->ds_bookmarks); dbn != NULL && dbn->dbn_phys.zbm_creation_txg >= dsl_dataset_phys(ds)->ds_prev_snap_txg; dbn = AVL_PREV(&ds->ds_bookmarks, dbn)) { /* * If the block was live (referenced) at the time of this * bookmark, add its space to the bookmark's FBN. */ if (bp->blk_birth <= dbn->dbn_phys.zbm_creation_txg && (dbn->dbn_phys.zbm_flags & ZBM_FLAG_HAS_FBN)) { mutex_enter(&dbn->dbn_lock); dbn->dbn_phys.zbm_referenced_freed_before_next_snap += bp_get_dsize_sync(dsl_dataset_get_spa(ds), bp); dbn->dbn_phys.zbm_compressed_freed_before_next_snap += BP_GET_PSIZE(bp); dbn->dbn_phys.zbm_uncompressed_freed_before_next_snap += BP_GET_UCSIZE(bp); /* * Changing the ZAP object here would be too * expensive. Also, we may be called from the zio * interrupt thread, which can't block on i/o. * Therefore, we mark this bookmark as dirty and * modify the ZAP once per txg, in * dsl_bookmark_sync_done(). */ dbn->dbn_dirty = B_TRUE; mutex_exit(&dbn->dbn_lock); } } } void dsl_bookmark_sync_done(dsl_dataset_t *ds, dmu_tx_t *tx) { dsl_pool_t *dp = dmu_tx_pool(tx); if (dsl_dataset_is_snapshot(ds)) return; /* * We only dirty bookmarks that are at or after the most recent * snapshot. We can't create snapshots between * dsl_bookmark_block_killed() and dsl_bookmark_sync_done(), so we * don't need to look at any bookmarks before ds_prev_snap_txg. */ for (dsl_bookmark_node_t *dbn = avl_last(&ds->ds_bookmarks); dbn != NULL && dbn->dbn_phys.zbm_creation_txg >= dsl_dataset_phys(ds)->ds_prev_snap_txg; dbn = AVL_PREV(&ds->ds_bookmarks, dbn)) { if (dbn->dbn_dirty) { /* * We only dirty nodes with HAS_FBN, therefore * we can always use the current bookmark struct size. */ ASSERT(dbn->dbn_phys.zbm_flags & ZBM_FLAG_HAS_FBN); VERIFY0(zap_update(dp->dp_meta_objset, ds->ds_bookmarks_obj, dbn->dbn_name, sizeof (uint64_t), sizeof (zfs_bookmark_phys_t) / sizeof (uint64_t), &dbn->dbn_phys, tx)); dbn->dbn_dirty = B_FALSE; } } #ifdef ZFS_DEBUG for (dsl_bookmark_node_t *dbn = avl_first(&ds->ds_bookmarks); dbn != NULL; dbn = AVL_NEXT(&ds->ds_bookmarks, dbn)) { ASSERT(!dbn->dbn_dirty); } #endif } /* * Return the TXG of the most recent bookmark (or 0 if there are no bookmarks). */ uint64_t dsl_bookmark_latest_txg(dsl_dataset_t *ds) { ASSERT(dsl_pool_config_held(ds->ds_dir->dd_pool)); dsl_bookmark_node_t *dbn = avl_last(&ds->ds_bookmarks); if (dbn == NULL) return (0); return (dbn->dbn_phys.zbm_creation_txg); } /* * Compare the redact_block_phys_t to the bookmark. If the last block in the * redact_block_phys_t is before the bookmark, return -1. If the first block in * the redact_block_phys_t is after the bookmark, return 1. Otherwise, the * bookmark is inside the range of the redact_block_phys_t, and we return 0. */ static int redact_block_zb_compare(redact_block_phys_t *first, zbookmark_phys_t *second) { /* * If the block_phys is for a previous object, or the last block in the * block_phys is strictly before the block in the bookmark, the * block_phys is earlier. */ if (first->rbp_object < second->zb_object || (first->rbp_object == second->zb_object && first->rbp_blkid + (redact_block_get_count(first) - 1) < second->zb_blkid)) { return (-1); } /* * If the bookmark is for a previous object, or the block in the * bookmark is strictly before the first block in the block_phys, the * bookmark is earlier. */ if (first->rbp_object > second->zb_object || (first->rbp_object == second->zb_object && first->rbp_blkid > second->zb_blkid)) { return (1); } return (0); } /* * Traverse the redaction list in the provided object, and call the callback for * each entry we find. Don't call the callback for any records before resume. */ int dsl_redaction_list_traverse(redaction_list_t *rl, zbookmark_phys_t *resume, rl_traverse_callback_t cb, void *arg) { objset_t *mos = rl->rl_mos; int err = 0; if (rl->rl_phys->rlp_last_object != UINT64_MAX || rl->rl_phys->rlp_last_blkid != UINT64_MAX) { /* * When we finish a send, we update the last object and offset * to UINT64_MAX. If a send fails partway through, the last * object and offset will have some other value, indicating how * far the send got. The redaction list must be complete before * it can be traversed, so return EINVAL if the last object and * blkid are not set to UINT64_MAX. */ return (SET_ERROR(EINVAL)); } /* * This allows us to skip the binary search and resume checking logic * below, if we're not resuming a redacted send. */ if (ZB_IS_ZERO(resume)) resume = NULL; /* * Binary search for the point to resume from. */ uint64_t maxidx = rl->rl_phys->rlp_num_entries - 1; uint64_t minidx = 0; while (resume != NULL && maxidx > minidx) { redact_block_phys_t rbp = { 0 }; ASSERT3U(maxidx, >, minidx); uint64_t mididx = minidx + ((maxidx - minidx) / 2); err = dmu_read(mos, rl->rl_object, mididx * sizeof (rbp), sizeof (rbp), &rbp, DMU_READ_NO_PREFETCH); if (err != 0) break; int cmp = redact_block_zb_compare(&rbp, resume); if (cmp == 0) { minidx = mididx; break; } else if (cmp > 0) { maxidx = (mididx == minidx ? minidx : mididx - 1); } else { minidx = mididx + 1; } } unsigned int bufsize = SPA_OLD_MAXBLOCKSIZE; redact_block_phys_t *buf = zio_data_buf_alloc(bufsize); unsigned int entries_per_buf = bufsize / sizeof (redact_block_phys_t); uint64_t start_block = minidx / entries_per_buf; err = dmu_read(mos, rl->rl_object, start_block * bufsize, bufsize, buf, DMU_READ_PREFETCH); for (uint64_t curidx = minidx; err == 0 && curidx < rl->rl_phys->rlp_num_entries; curidx++) { /* * We read in the redaction list one block at a time. Once we * finish with all the entries in a given block, we read in a * new one. The predictive prefetcher will take care of any * prefetching, and this code shouldn't be the bottleneck, so we * don't need to do manual prefetching. */ if (curidx % entries_per_buf == 0) { err = dmu_read(mos, rl->rl_object, curidx * sizeof (*buf), bufsize, buf, DMU_READ_PREFETCH); if (err != 0) break; } redact_block_phys_t *rb = &buf[curidx % entries_per_buf]; /* * If resume is non-null, we should either not send the data, or * null out resume so we don't have to keep doing these * comparisons. */ if (resume != NULL) { /* * It is possible that after the binary search we got * a record before the resume point. There's two cases * where this can occur. If the record is the last * redaction record, and the resume point is after the * end of the redacted data, curidx will be the last * redaction record. In that case, the loop will end * after this iteration. The second case is if the * resume point is between two redaction records, the * binary search can return either the record before * or after the resume point. In that case, the next * iteration will be greater than the resume point. */ if (redact_block_zb_compare(rb, resume) < 0) { ASSERT3U(curidx, ==, minidx); continue; } else { /* * If the place to resume is in the middle of * the range described by this * redact_block_phys, then modify the * redact_block_phys in memory so we generate * the right records. */ if (resume->zb_object == rb->rbp_object && resume->zb_blkid > rb->rbp_blkid) { uint64_t diff = resume->zb_blkid - rb->rbp_blkid; rb->rbp_blkid = resume->zb_blkid; redact_block_set_count(rb, redact_block_get_count(rb) - diff); } resume = NULL; } } if (cb(rb, arg) != 0) { err = EINTR; break; } } zio_data_buf_free(buf, bufsize); return (err); } diff --git a/module/zfs/dsl_dataset.c b/module/zfs/dsl_dataset.c index 7a066b786cd0..c7577fc584af 100644 --- a/module/zfs/dsl_dataset.c +++ b/module/zfs/dsl_dataset.c @@ -1,5019 +1,5020 @@ /* * CDDL HEADER START * * The contents of this file are subject to the terms of the * Common Development and Distribution License (the "License"). * You may not use this file except in compliance with the License. * * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE * or https://opensource.org/licenses/CDDL-1.0. * See the License for the specific language governing permissions * and limitations under the License. * * When distributing Covered Code, include this CDDL HEADER in each * file and include the License file at usr/src/OPENSOLARIS.LICENSE. * If applicable, add the following below this CDDL HEADER, with the * fields enclosed by brackets "[]" replaced with your own identifying * information: Portions Copyright [yyyy] [name of copyright owner] * * CDDL HEADER END */ /* * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. * Copyright (c) 2011, 2020 by Delphix. All rights reserved. * Copyright (c) 2014, Joyent, Inc. All rights reserved. * Copyright (c) 2014 RackTop Systems. * Copyright (c) 2014 Spectra Logic Corporation, All rights reserved. * Copyright (c) 2016 Actifio, Inc. All rights reserved. * Copyright 2016, OmniTI Computer Consulting, Inc. All rights reserved. * Copyright 2017 Nexenta Systems, Inc. * Copyright (c) 2019, Klara Inc. * Copyright (c) 2019, Allan Jude * Copyright (c) 2020 The FreeBSD Foundation [1] * * [1] Portions of this software were developed by Allan Jude * under sponsorship from the FreeBSD Foundation. */ #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include /* * The SPA supports block sizes up to 16MB. However, very large blocks * can have an impact on i/o latency (e.g. tying up a spinning disk for * ~300ms), and also potentially on the memory allocator. Therefore, * we did not allow the recordsize to be set larger than zfs_max_recordsize * (former default: 1MB). Larger blocks could be created by changing this * tunable, and pools with larger blocks could always be imported and used, * regardless of this setting. * * We do, however, still limit it by default to 1M on x86_32, because Linux's * 3/1 memory split doesn't leave much room for 16M chunks. */ #ifdef _ILP32 uint_t zfs_max_recordsize = 1 * 1024 * 1024; #else uint_t zfs_max_recordsize = 16 * 1024 * 1024; #endif static int zfs_allow_redacted_dataset_mount = 0; int zfs_snapshot_history_enabled = 1; #define SWITCH64(x, y) \ { \ uint64_t __tmp = (x); \ (x) = (y); \ (y) = __tmp; \ } #define DS_REF_MAX (1ULL << 62) static void dsl_dataset_set_remap_deadlist_object(dsl_dataset_t *ds, uint64_t obj, dmu_tx_t *tx); static void dsl_dataset_unset_remap_deadlist_object(dsl_dataset_t *ds, dmu_tx_t *tx); static void unload_zfeature(dsl_dataset_t *ds, spa_feature_t f); extern uint_t spa_asize_inflation; static zil_header_t zero_zil; /* * Figure out how much of this delta should be propagated to the dsl_dir * layer. If there's a refreservation, that space has already been * partially accounted for in our ancestors. */ static int64_t parent_delta(dsl_dataset_t *ds, int64_t delta) { dsl_dataset_phys_t *ds_phys; uint64_t old_bytes, new_bytes; if (ds->ds_reserved == 0) return (delta); ds_phys = dsl_dataset_phys(ds); old_bytes = MAX(ds_phys->ds_unique_bytes, ds->ds_reserved); new_bytes = MAX(ds_phys->ds_unique_bytes + delta, ds->ds_reserved); ASSERT3U(ABS((int64_t)(new_bytes - old_bytes)), <=, ABS(delta)); return (new_bytes - old_bytes); } void dsl_dataset_block_born(dsl_dataset_t *ds, const blkptr_t *bp, dmu_tx_t *tx) { spa_t *spa = dmu_tx_pool(tx)->dp_spa; int used = bp_get_dsize_sync(spa, bp); int compressed = BP_GET_PSIZE(bp); int uncompressed = BP_GET_UCSIZE(bp); int64_t delta; spa_feature_t f; dprintf_bp(bp, "ds=%p", ds); ASSERT(dmu_tx_is_syncing(tx)); /* It could have been compressed away to nothing */ if (BP_IS_HOLE(bp) || BP_IS_REDACTED(bp)) return; ASSERT(BP_GET_TYPE(bp) != DMU_OT_NONE); ASSERT(DMU_OT_IS_VALID(BP_GET_TYPE(bp))); if (ds == NULL) { dsl_pool_mos_diduse_space(tx->tx_pool, used, compressed, uncompressed); return; } ASSERT3U(bp->blk_birth, >, dsl_dataset_phys(ds)->ds_prev_snap_txg); dmu_buf_will_dirty(ds->ds_dbuf, tx); mutex_enter(&ds->ds_lock); delta = parent_delta(ds, used); dsl_dataset_phys(ds)->ds_referenced_bytes += used; dsl_dataset_phys(ds)->ds_compressed_bytes += compressed; dsl_dataset_phys(ds)->ds_uncompressed_bytes += uncompressed; dsl_dataset_phys(ds)->ds_unique_bytes += used; if (BP_GET_LSIZE(bp) > SPA_OLD_MAXBLOCKSIZE) { ds->ds_feature_activation[SPA_FEATURE_LARGE_BLOCKS] = (void *)B_TRUE; } f = zio_checksum_to_feature(BP_GET_CHECKSUM(bp)); if (f != SPA_FEATURE_NONE) { ASSERT3S(spa_feature_table[f].fi_type, ==, ZFEATURE_TYPE_BOOLEAN); ds->ds_feature_activation[f] = (void *)B_TRUE; } f = zio_compress_to_feature(BP_GET_COMPRESS(bp)); if (f != SPA_FEATURE_NONE) { ASSERT3S(spa_feature_table[f].fi_type, ==, ZFEATURE_TYPE_BOOLEAN); ds->ds_feature_activation[f] = (void *)B_TRUE; } /* * Track block for livelist, but ignore embedded blocks because * they do not need to be freed. */ if (dsl_deadlist_is_open(&ds->ds_dir->dd_livelist) && bp->blk_birth > ds->ds_dir->dd_origin_txg && !(BP_IS_EMBEDDED(bp))) { ASSERT(dsl_dir_is_clone(ds->ds_dir)); ASSERT(spa_feature_is_enabled(spa, SPA_FEATURE_LIVELIST)); bplist_append(&ds->ds_dir->dd_pending_allocs, bp); } mutex_exit(&ds->ds_lock); dsl_dir_diduse_transfer_space(ds->ds_dir, delta, compressed, uncompressed, used, DD_USED_REFRSRV, DD_USED_HEAD, tx); } /* * Called when the specified segment has been remapped, and is thus no * longer referenced in the head dataset. The vdev must be indirect. * * If the segment is referenced by a snapshot, put it on the remap deadlist. * Otherwise, add this segment to the obsolete spacemap. */ void dsl_dataset_block_remapped(dsl_dataset_t *ds, uint64_t vdev, uint64_t offset, uint64_t size, uint64_t birth, dmu_tx_t *tx) { spa_t *spa = ds->ds_dir->dd_pool->dp_spa; ASSERT(dmu_tx_is_syncing(tx)); ASSERT(birth <= tx->tx_txg); ASSERT(!ds->ds_is_snapshot); if (birth > dsl_dataset_phys(ds)->ds_prev_snap_txg) { spa_vdev_indirect_mark_obsolete(spa, vdev, offset, size, tx); } else { blkptr_t fakebp; dva_t *dva = &fakebp.blk_dva[0]; ASSERT(ds != NULL); mutex_enter(&ds->ds_remap_deadlist_lock); if (!dsl_dataset_remap_deadlist_exists(ds)) { dsl_dataset_create_remap_deadlist(ds, tx); } mutex_exit(&ds->ds_remap_deadlist_lock); BP_ZERO(&fakebp); fakebp.blk_birth = birth; DVA_SET_VDEV(dva, vdev); DVA_SET_OFFSET(dva, offset); DVA_SET_ASIZE(dva, size); dsl_deadlist_insert(&ds->ds_remap_deadlist, &fakebp, B_FALSE, tx); } } int dsl_dataset_block_kill(dsl_dataset_t *ds, const blkptr_t *bp, dmu_tx_t *tx, boolean_t async) { spa_t *spa = dmu_tx_pool(tx)->dp_spa; int used = bp_get_dsize_sync(spa, bp); int compressed = BP_GET_PSIZE(bp); int uncompressed = BP_GET_UCSIZE(bp); if (BP_IS_HOLE(bp) || BP_IS_REDACTED(bp)) return (0); ASSERT(dmu_tx_is_syncing(tx)); ASSERT(bp->blk_birth <= tx->tx_txg); if (ds == NULL) { dsl_free(tx->tx_pool, tx->tx_txg, bp); dsl_pool_mos_diduse_space(tx->tx_pool, -used, -compressed, -uncompressed); return (used); } ASSERT3P(tx->tx_pool, ==, ds->ds_dir->dd_pool); ASSERT(!ds->ds_is_snapshot); dmu_buf_will_dirty(ds->ds_dbuf, tx); /* * Track block for livelist, but ignore embedded blocks because * they do not need to be freed. */ if (dsl_deadlist_is_open(&ds->ds_dir->dd_livelist) && bp->blk_birth > ds->ds_dir->dd_origin_txg && !(BP_IS_EMBEDDED(bp))) { ASSERT(dsl_dir_is_clone(ds->ds_dir)); ASSERT(spa_feature_is_enabled(spa, SPA_FEATURE_LIVELIST)); bplist_append(&ds->ds_dir->dd_pending_frees, bp); } if (bp->blk_birth > dsl_dataset_phys(ds)->ds_prev_snap_txg) { int64_t delta; dprintf_bp(bp, "freeing ds=%llu", (u_longlong_t)ds->ds_object); dsl_free(tx->tx_pool, tx->tx_txg, bp); mutex_enter(&ds->ds_lock); ASSERT(dsl_dataset_phys(ds)->ds_unique_bytes >= used || !DS_UNIQUE_IS_ACCURATE(ds)); delta = parent_delta(ds, -used); dsl_dataset_phys(ds)->ds_unique_bytes -= used; mutex_exit(&ds->ds_lock); dsl_dir_diduse_transfer_space(ds->ds_dir, delta, -compressed, -uncompressed, -used, DD_USED_REFRSRV, DD_USED_HEAD, tx); } else { dprintf_bp(bp, "putting on dead list: %s", ""); if (async) { /* * We are here as part of zio's write done callback, * which means we're a zio interrupt thread. We can't * call dsl_deadlist_insert() now because it may block * waiting for I/O. Instead, put bp on the deferred * queue and let dsl_pool_sync() finish the job. */ bplist_append(&ds->ds_pending_deadlist, bp); } else { dsl_deadlist_insert(&ds->ds_deadlist, bp, B_FALSE, tx); } ASSERT3U(ds->ds_prev->ds_object, ==, dsl_dataset_phys(ds)->ds_prev_snap_obj); ASSERT(dsl_dataset_phys(ds->ds_prev)->ds_num_children > 0); /* if (bp->blk_birth > prev prev snap txg) prev unique += bs */ if (dsl_dataset_phys(ds->ds_prev)->ds_next_snap_obj == ds->ds_object && bp->blk_birth > dsl_dataset_phys(ds->ds_prev)->ds_prev_snap_txg) { dmu_buf_will_dirty(ds->ds_prev->ds_dbuf, tx); mutex_enter(&ds->ds_prev->ds_lock); dsl_dataset_phys(ds->ds_prev)->ds_unique_bytes += used; mutex_exit(&ds->ds_prev->ds_lock); } if (bp->blk_birth > ds->ds_dir->dd_origin_txg) { dsl_dir_transfer_space(ds->ds_dir, used, DD_USED_HEAD, DD_USED_SNAP, tx); } } dsl_bookmark_block_killed(ds, bp, tx); mutex_enter(&ds->ds_lock); ASSERT3U(dsl_dataset_phys(ds)->ds_referenced_bytes, >=, used); dsl_dataset_phys(ds)->ds_referenced_bytes -= used; ASSERT3U(dsl_dataset_phys(ds)->ds_compressed_bytes, >=, compressed); dsl_dataset_phys(ds)->ds_compressed_bytes -= compressed; ASSERT3U(dsl_dataset_phys(ds)->ds_uncompressed_bytes, >=, uncompressed); dsl_dataset_phys(ds)->ds_uncompressed_bytes -= uncompressed; mutex_exit(&ds->ds_lock); return (used); } struct feature_type_uint64_array_arg { uint64_t length; uint64_t *array; }; static void unload_zfeature(dsl_dataset_t *ds, spa_feature_t f) { switch (spa_feature_table[f].fi_type) { case ZFEATURE_TYPE_BOOLEAN: break; case ZFEATURE_TYPE_UINT64_ARRAY: { struct feature_type_uint64_array_arg *ftuaa = ds->ds_feature[f]; kmem_free(ftuaa->array, ftuaa->length * sizeof (uint64_t)); kmem_free(ftuaa, sizeof (*ftuaa)); break; } default: panic("Invalid zfeature type %d", spa_feature_table[f].fi_type); } } static int load_zfeature(objset_t *mos, dsl_dataset_t *ds, spa_feature_t f) { int err = 0; switch (spa_feature_table[f].fi_type) { case ZFEATURE_TYPE_BOOLEAN: err = zap_contains(mos, ds->ds_object, spa_feature_table[f].fi_guid); if (err == 0) { ds->ds_feature[f] = (void *)B_TRUE; } else { ASSERT3U(err, ==, ENOENT); err = 0; } break; case ZFEATURE_TYPE_UINT64_ARRAY: { uint64_t int_size, num_int; uint64_t *data; err = zap_length(mos, ds->ds_object, spa_feature_table[f].fi_guid, &int_size, &num_int); if (err != 0) { ASSERT3U(err, ==, ENOENT); err = 0; break; } ASSERT3U(int_size, ==, sizeof (uint64_t)); data = kmem_alloc(int_size * num_int, KM_SLEEP); VERIFY0(zap_lookup(mos, ds->ds_object, spa_feature_table[f].fi_guid, int_size, num_int, data)); struct feature_type_uint64_array_arg *ftuaa = kmem_alloc(sizeof (*ftuaa), KM_SLEEP); ftuaa->length = num_int; ftuaa->array = data; ds->ds_feature[f] = ftuaa; break; } default: panic("Invalid zfeature type %d", spa_feature_table[f].fi_type); } return (err); } /* * We have to release the fsid synchronously or we risk that a subsequent * mount of the same dataset will fail to unique_insert the fsid. This * failure would manifest itself as the fsid of this dataset changing * between mounts which makes NFS clients quite unhappy. */ static void dsl_dataset_evict_sync(void *dbu) { dsl_dataset_t *ds = dbu; ASSERT(ds->ds_owner == NULL); unique_remove(ds->ds_fsid_guid); } static void dsl_dataset_evict_async(void *dbu) { dsl_dataset_t *ds = dbu; ASSERT(ds->ds_owner == NULL); ds->ds_dbuf = NULL; if (ds->ds_objset != NULL) dmu_objset_evict(ds->ds_objset); if (ds->ds_prev) { dsl_dataset_rele(ds->ds_prev, ds); ds->ds_prev = NULL; } dsl_bookmark_fini_ds(ds); bplist_destroy(&ds->ds_pending_deadlist); if (dsl_deadlist_is_open(&ds->ds_deadlist)) dsl_deadlist_close(&ds->ds_deadlist); if (dsl_deadlist_is_open(&ds->ds_remap_deadlist)) dsl_deadlist_close(&ds->ds_remap_deadlist); if (ds->ds_dir) dsl_dir_async_rele(ds->ds_dir, ds); ASSERT(!list_link_active(&ds->ds_synced_link)); for (spa_feature_t f = 0; f < SPA_FEATURES; f++) { if (dsl_dataset_feature_is_active(ds, f)) unload_zfeature(ds, f); } list_destroy(&ds->ds_prop_cbs); mutex_destroy(&ds->ds_lock); mutex_destroy(&ds->ds_opening_lock); mutex_destroy(&ds->ds_sendstream_lock); mutex_destroy(&ds->ds_remap_deadlist_lock); zfs_refcount_destroy(&ds->ds_longholds); rrw_destroy(&ds->ds_bp_rwlock); kmem_free(ds, sizeof (dsl_dataset_t)); } int dsl_dataset_get_snapname(dsl_dataset_t *ds) { dsl_dataset_phys_t *headphys; int err; dmu_buf_t *headdbuf; dsl_pool_t *dp = ds->ds_dir->dd_pool; objset_t *mos = dp->dp_meta_objset; if (ds->ds_snapname[0]) return (0); if (dsl_dataset_phys(ds)->ds_next_snap_obj == 0) return (0); err = dmu_bonus_hold(mos, dsl_dir_phys(ds->ds_dir)->dd_head_dataset_obj, FTAG, &headdbuf); if (err != 0) return (err); headphys = headdbuf->db_data; err = zap_value_search(dp->dp_meta_objset, headphys->ds_snapnames_zapobj, ds->ds_object, 0, ds->ds_snapname); if (err != 0 && zfs_recover == B_TRUE) { err = 0; (void) snprintf(ds->ds_snapname, sizeof (ds->ds_snapname), "SNAPOBJ=%llu-ERR=%d", (unsigned long long)ds->ds_object, err); } dmu_buf_rele(headdbuf, FTAG); return (err); } int dsl_dataset_snap_lookup(dsl_dataset_t *ds, const char *name, uint64_t *value) { objset_t *mos = ds->ds_dir->dd_pool->dp_meta_objset; uint64_t snapobj = dsl_dataset_phys(ds)->ds_snapnames_zapobj; matchtype_t mt = 0; int err; if (dsl_dataset_phys(ds)->ds_flags & DS_FLAG_CI_DATASET) mt = MT_NORMALIZE; err = zap_lookup_norm(mos, snapobj, name, 8, 1, value, mt, NULL, 0, NULL); if (err == ENOTSUP && (mt & MT_NORMALIZE)) err = zap_lookup(mos, snapobj, name, 8, 1, value); return (err); } int dsl_dataset_snap_remove(dsl_dataset_t *ds, const char *name, dmu_tx_t *tx, boolean_t adj_cnt) { objset_t *mos = ds->ds_dir->dd_pool->dp_meta_objset; uint64_t snapobj = dsl_dataset_phys(ds)->ds_snapnames_zapobj; matchtype_t mt = 0; int err; dsl_dir_snap_cmtime_update(ds->ds_dir, tx); if (dsl_dataset_phys(ds)->ds_flags & DS_FLAG_CI_DATASET) mt = MT_NORMALIZE; err = zap_remove_norm(mos, snapobj, name, mt, tx); if (err == ENOTSUP && (mt & MT_NORMALIZE)) err = zap_remove(mos, snapobj, name, tx); if (err == 0 && adj_cnt) dsl_fs_ss_count_adjust(ds->ds_dir, -1, DD_FIELD_SNAPSHOT_COUNT, tx); return (err); } boolean_t dsl_dataset_try_add_ref(dsl_pool_t *dp, dsl_dataset_t *ds, const void *tag) { dmu_buf_t *dbuf = ds->ds_dbuf; boolean_t result = B_FALSE; if (dbuf != NULL && dmu_buf_try_add_ref(dbuf, dp->dp_meta_objset, ds->ds_object, DMU_BONUS_BLKID, tag)) { if (ds == dmu_buf_get_user(dbuf)) result = B_TRUE; else dmu_buf_rele(dbuf, tag); } return (result); } int dsl_dataset_hold_obj(dsl_pool_t *dp, uint64_t dsobj, const void *tag, dsl_dataset_t **dsp) { objset_t *mos = dp->dp_meta_objset; dmu_buf_t *dbuf; dsl_dataset_t *ds; int err; dmu_object_info_t doi; ASSERT(dsl_pool_config_held(dp)); err = dmu_bonus_hold(mos, dsobj, tag, &dbuf); if (err != 0) return (err); /* Make sure dsobj has the correct object type. */ dmu_object_info_from_db(dbuf, &doi); if (doi.doi_bonus_type != DMU_OT_DSL_DATASET) { dmu_buf_rele(dbuf, tag); return (SET_ERROR(EINVAL)); } ds = dmu_buf_get_user(dbuf); if (ds == NULL) { dsl_dataset_t *winner = NULL; ds = kmem_zalloc(sizeof (dsl_dataset_t), KM_SLEEP); ds->ds_dbuf = dbuf; ds->ds_object = dsobj; ds->ds_is_snapshot = dsl_dataset_phys(ds)->ds_num_children != 0; list_link_init(&ds->ds_synced_link); err = dsl_dir_hold_obj(dp, dsl_dataset_phys(ds)->ds_dir_obj, NULL, ds, &ds->ds_dir); if (err != 0) { kmem_free(ds, sizeof (dsl_dataset_t)); dmu_buf_rele(dbuf, tag); return (err); } mutex_init(&ds->ds_lock, NULL, MUTEX_DEFAULT, NULL); mutex_init(&ds->ds_opening_lock, NULL, MUTEX_DEFAULT, NULL); mutex_init(&ds->ds_sendstream_lock, NULL, MUTEX_DEFAULT, NULL); mutex_init(&ds->ds_remap_deadlist_lock, NULL, MUTEX_DEFAULT, NULL); rrw_init(&ds->ds_bp_rwlock, B_FALSE); zfs_refcount_create(&ds->ds_longholds); bplist_create(&ds->ds_pending_deadlist); list_create(&ds->ds_sendstreams, sizeof (dmu_sendstatus_t), offsetof(dmu_sendstatus_t, dss_link)); list_create(&ds->ds_prop_cbs, sizeof (dsl_prop_cb_record_t), offsetof(dsl_prop_cb_record_t, cbr_ds_node)); if (doi.doi_type == DMU_OTN_ZAP_METADATA) { spa_feature_t f; for (f = 0; f < SPA_FEATURES; f++) { if (!(spa_feature_table[f].fi_flags & ZFEATURE_FLAG_PER_DATASET)) continue; err = load_zfeature(mos, ds, f); } } if (!ds->ds_is_snapshot) { ds->ds_snapname[0] = '\0'; if (dsl_dataset_phys(ds)->ds_prev_snap_obj != 0) { err = dsl_dataset_hold_obj(dp, dsl_dataset_phys(ds)->ds_prev_snap_obj, ds, &ds->ds_prev); } if (err != 0) goto after_dsl_bookmark_fini; err = dsl_bookmark_init_ds(ds); } else { if (zfs_flags & ZFS_DEBUG_SNAPNAMES) err = dsl_dataset_get_snapname(ds); if (err == 0 && dsl_dataset_phys(ds)->ds_userrefs_obj != 0) { err = zap_count( ds->ds_dir->dd_pool->dp_meta_objset, dsl_dataset_phys(ds)->ds_userrefs_obj, &ds->ds_userrefs); } } if (err == 0 && !ds->ds_is_snapshot) { err = dsl_prop_get_int_ds(ds, zfs_prop_to_name(ZFS_PROP_REFRESERVATION), &ds->ds_reserved); if (err == 0) { err = dsl_prop_get_int_ds(ds, zfs_prop_to_name(ZFS_PROP_REFQUOTA), &ds->ds_quota); } } else { ds->ds_reserved = ds->ds_quota = 0; } if (err == 0 && ds->ds_dir->dd_crypto_obj != 0 && ds->ds_is_snapshot && zap_contains(mos, dsobj, DS_FIELD_IVSET_GUID) != 0) { dp->dp_spa->spa_errata = ZPOOL_ERRATA_ZOL_8308_ENCRYPTION; } dsl_deadlist_open(&ds->ds_deadlist, mos, dsl_dataset_phys(ds)->ds_deadlist_obj); uint64_t remap_deadlist_obj = dsl_dataset_get_remap_deadlist_object(ds); if (remap_deadlist_obj != 0) { dsl_deadlist_open(&ds->ds_remap_deadlist, mos, remap_deadlist_obj); } dmu_buf_init_user(&ds->ds_dbu, dsl_dataset_evict_sync, dsl_dataset_evict_async, &ds->ds_dbuf); if (err == 0) winner = dmu_buf_set_user_ie(dbuf, &ds->ds_dbu); if (err != 0 || winner != NULL) { dsl_deadlist_close(&ds->ds_deadlist); if (dsl_deadlist_is_open(&ds->ds_remap_deadlist)) dsl_deadlist_close(&ds->ds_remap_deadlist); dsl_bookmark_fini_ds(ds); after_dsl_bookmark_fini: if (ds->ds_prev) dsl_dataset_rele(ds->ds_prev, ds); dsl_dir_rele(ds->ds_dir, ds); for (spa_feature_t f = 0; f < SPA_FEATURES; f++) { if (dsl_dataset_feature_is_active(ds, f)) unload_zfeature(ds, f); } list_destroy(&ds->ds_prop_cbs); list_destroy(&ds->ds_sendstreams); bplist_destroy(&ds->ds_pending_deadlist); mutex_destroy(&ds->ds_lock); mutex_destroy(&ds->ds_opening_lock); mutex_destroy(&ds->ds_sendstream_lock); mutex_destroy(&ds->ds_remap_deadlist_lock); zfs_refcount_destroy(&ds->ds_longholds); rrw_destroy(&ds->ds_bp_rwlock); kmem_free(ds, sizeof (dsl_dataset_t)); if (err != 0) { dmu_buf_rele(dbuf, tag); return (err); } ds = winner; } else { ds->ds_fsid_guid = unique_insert(dsl_dataset_phys(ds)->ds_fsid_guid); if (ds->ds_fsid_guid != dsl_dataset_phys(ds)->ds_fsid_guid) { zfs_dbgmsg("ds_fsid_guid changed from " "%llx to %llx for pool %s dataset id %llu", (long long) dsl_dataset_phys(ds)->ds_fsid_guid, (long long)ds->ds_fsid_guid, spa_name(dp->dp_spa), (u_longlong_t)dsobj); } } } ASSERT3P(ds->ds_dbuf, ==, dbuf); ASSERT3P(dsl_dataset_phys(ds), ==, dbuf->db_data); ASSERT(dsl_dataset_phys(ds)->ds_prev_snap_obj != 0 || spa_version(dp->dp_spa) < SPA_VERSION_ORIGIN || dp->dp_origin_snap == NULL || ds == dp->dp_origin_snap); *dsp = ds; return (0); } int dsl_dataset_create_key_mapping(dsl_dataset_t *ds) { dsl_dir_t *dd = ds->ds_dir; if (dd->dd_crypto_obj == 0) return (0); return (spa_keystore_create_mapping(dd->dd_pool->dp_spa, ds, ds, &ds->ds_key_mapping)); } int dsl_dataset_hold_obj_flags(dsl_pool_t *dp, uint64_t dsobj, ds_hold_flags_t flags, const void *tag, dsl_dataset_t **dsp) { int err; err = dsl_dataset_hold_obj(dp, dsobj, tag, dsp); if (err != 0) return (err); ASSERT3P(*dsp, !=, NULL); if (flags & DS_HOLD_FLAG_DECRYPT) { err = dsl_dataset_create_key_mapping(*dsp); if (err != 0) dsl_dataset_rele(*dsp, tag); } return (err); } int dsl_dataset_hold_flags(dsl_pool_t *dp, const char *name, ds_hold_flags_t flags, const void *tag, dsl_dataset_t **dsp) { dsl_dir_t *dd; const char *snapname; uint64_t obj; int err = 0; dsl_dataset_t *ds; err = dsl_dir_hold(dp, name, FTAG, &dd, &snapname); if (err != 0) return (err); ASSERT(dsl_pool_config_held(dp)); obj = dsl_dir_phys(dd)->dd_head_dataset_obj; if (obj != 0) err = dsl_dataset_hold_obj_flags(dp, obj, flags, tag, &ds); else err = SET_ERROR(ENOENT); /* we may be looking for a snapshot */ if (err == 0 && snapname != NULL) { dsl_dataset_t *snap_ds; if (*snapname++ != '@') { dsl_dataset_rele_flags(ds, flags, tag); dsl_dir_rele(dd, FTAG); return (SET_ERROR(ENOENT)); } dprintf("looking for snapshot '%s'\n", snapname); err = dsl_dataset_snap_lookup(ds, snapname, &obj); if (err == 0) { err = dsl_dataset_hold_obj_flags(dp, obj, flags, tag, &snap_ds); } dsl_dataset_rele_flags(ds, flags, tag); if (err == 0) { mutex_enter(&snap_ds->ds_lock); if (snap_ds->ds_snapname[0] == 0) (void) strlcpy(snap_ds->ds_snapname, snapname, sizeof (snap_ds->ds_snapname)); mutex_exit(&snap_ds->ds_lock); ds = snap_ds; } } if (err == 0) *dsp = ds; dsl_dir_rele(dd, FTAG); return (err); } int dsl_dataset_hold(dsl_pool_t *dp, const char *name, const void *tag, dsl_dataset_t **dsp) { return (dsl_dataset_hold_flags(dp, name, 0, tag, dsp)); } static int dsl_dataset_own_obj_impl(dsl_pool_t *dp, uint64_t dsobj, ds_hold_flags_t flags, const void *tag, boolean_t override, dsl_dataset_t **dsp) { int err = dsl_dataset_hold_obj_flags(dp, dsobj, flags, tag, dsp); if (err != 0) return (err); if (!dsl_dataset_tryown(*dsp, tag, override)) { dsl_dataset_rele_flags(*dsp, flags, tag); *dsp = NULL; return (SET_ERROR(EBUSY)); } return (0); } int dsl_dataset_own_obj(dsl_pool_t *dp, uint64_t dsobj, ds_hold_flags_t flags, const void *tag, dsl_dataset_t **dsp) { return (dsl_dataset_own_obj_impl(dp, dsobj, flags, tag, B_FALSE, dsp)); } int dsl_dataset_own_obj_force(dsl_pool_t *dp, uint64_t dsobj, ds_hold_flags_t flags, const void *tag, dsl_dataset_t **dsp) { return (dsl_dataset_own_obj_impl(dp, dsobj, flags, tag, B_TRUE, dsp)); } static int dsl_dataset_own_impl(dsl_pool_t *dp, const char *name, ds_hold_flags_t flags, const void *tag, boolean_t override, dsl_dataset_t **dsp) { int err = dsl_dataset_hold_flags(dp, name, flags, tag, dsp); if (err != 0) return (err); if (!dsl_dataset_tryown(*dsp, tag, override)) { dsl_dataset_rele_flags(*dsp, flags, tag); return (SET_ERROR(EBUSY)); } return (0); } int dsl_dataset_own_force(dsl_pool_t *dp, const char *name, ds_hold_flags_t flags, const void *tag, dsl_dataset_t **dsp) { return (dsl_dataset_own_impl(dp, name, flags, tag, B_TRUE, dsp)); } int dsl_dataset_own(dsl_pool_t *dp, const char *name, ds_hold_flags_t flags, const void *tag, dsl_dataset_t **dsp) { return (dsl_dataset_own_impl(dp, name, flags, tag, B_FALSE, dsp)); } /* * See the comment above dsl_pool_hold() for details. In summary, a long * hold is used to prevent destruction of a dataset while the pool hold * is dropped, allowing other concurrent operations (e.g. spa_sync()). * * The dataset and pool must be held when this function is called. After it * is called, the pool hold may be released while the dataset is still held * and accessed. */ void dsl_dataset_long_hold(dsl_dataset_t *ds, const void *tag) { ASSERT(dsl_pool_config_held(ds->ds_dir->dd_pool)); (void) zfs_refcount_add(&ds->ds_longholds, tag); } void dsl_dataset_long_rele(dsl_dataset_t *ds, const void *tag) { (void) zfs_refcount_remove(&ds->ds_longholds, tag); } /* Return B_TRUE if there are any long holds on this dataset. */ boolean_t dsl_dataset_long_held(dsl_dataset_t *ds) { return (!zfs_refcount_is_zero(&ds->ds_longholds)); } void dsl_dataset_name(dsl_dataset_t *ds, char *name) { if (ds == NULL) { (void) strlcpy(name, "mos", ZFS_MAX_DATASET_NAME_LEN); } else { dsl_dir_name(ds->ds_dir, name); VERIFY0(dsl_dataset_get_snapname(ds)); if (ds->ds_snapname[0]) { VERIFY3U(strlcat(name, "@", ZFS_MAX_DATASET_NAME_LEN), <, ZFS_MAX_DATASET_NAME_LEN); /* * We use a "recursive" mutex so that we * can call dprintf_ds() with ds_lock held. */ if (!MUTEX_HELD(&ds->ds_lock)) { mutex_enter(&ds->ds_lock); VERIFY3U(strlcat(name, ds->ds_snapname, ZFS_MAX_DATASET_NAME_LEN), <, ZFS_MAX_DATASET_NAME_LEN); mutex_exit(&ds->ds_lock); } else { VERIFY3U(strlcat(name, ds->ds_snapname, ZFS_MAX_DATASET_NAME_LEN), <, ZFS_MAX_DATASET_NAME_LEN); } } } } int dsl_dataset_namelen(dsl_dataset_t *ds) { VERIFY0(dsl_dataset_get_snapname(ds)); mutex_enter(&ds->ds_lock); int len = strlen(ds->ds_snapname); mutex_exit(&ds->ds_lock); /* add '@' if ds is a snap */ if (len > 0) len++; len += dsl_dir_namelen(ds->ds_dir); return (len); } void dsl_dataset_rele(dsl_dataset_t *ds, const void *tag) { dmu_buf_rele(ds->ds_dbuf, tag); } void dsl_dataset_remove_key_mapping(dsl_dataset_t *ds) { dsl_dir_t *dd = ds->ds_dir; if (dd == NULL || dd->dd_crypto_obj == 0) return; (void) spa_keystore_remove_mapping(dd->dd_pool->dp_spa, ds->ds_object, ds); } void dsl_dataset_rele_flags(dsl_dataset_t *ds, ds_hold_flags_t flags, const void *tag) { if (flags & DS_HOLD_FLAG_DECRYPT) dsl_dataset_remove_key_mapping(ds); dsl_dataset_rele(ds, tag); } void dsl_dataset_disown(dsl_dataset_t *ds, ds_hold_flags_t flags, const void *tag) { ASSERT3P(ds->ds_owner, ==, tag); ASSERT(ds->ds_dbuf != NULL); mutex_enter(&ds->ds_lock); ds->ds_owner = NULL; mutex_exit(&ds->ds_lock); dsl_dataset_long_rele(ds, tag); dsl_dataset_rele_flags(ds, flags, tag); } boolean_t dsl_dataset_tryown(dsl_dataset_t *ds, const void *tag, boolean_t override) { boolean_t gotit = FALSE; ASSERT(dsl_pool_config_held(ds->ds_dir->dd_pool)); mutex_enter(&ds->ds_lock); if (ds->ds_owner == NULL && (override || !(DS_IS_INCONSISTENT(ds) || (dsl_dataset_feature_is_active(ds, SPA_FEATURE_REDACTED_DATASETS) && !zfs_allow_redacted_dataset_mount)))) { ds->ds_owner = tag; dsl_dataset_long_hold(ds, tag); gotit = TRUE; } mutex_exit(&ds->ds_lock); return (gotit); } boolean_t dsl_dataset_has_owner(dsl_dataset_t *ds) { boolean_t rv; mutex_enter(&ds->ds_lock); rv = (ds->ds_owner != NULL); mutex_exit(&ds->ds_lock); return (rv); } static boolean_t zfeature_active(spa_feature_t f, void *arg) { switch (spa_feature_table[f].fi_type) { case ZFEATURE_TYPE_BOOLEAN: { boolean_t val = (boolean_t)(uintptr_t)arg; ASSERT(val == B_FALSE || val == B_TRUE); return (val); } case ZFEATURE_TYPE_UINT64_ARRAY: /* * In this case, arg is a uint64_t array. The feature is active * if the array is non-null. */ return (arg != NULL); default: panic("Invalid zfeature type %d", spa_feature_table[f].fi_type); return (B_FALSE); } } boolean_t dsl_dataset_feature_is_active(dsl_dataset_t *ds, spa_feature_t f) { return (zfeature_active(f, ds->ds_feature[f])); } /* * The buffers passed out by this function are references to internal buffers; * they should not be freed by callers of this function, and they should not be * used after the dataset has been released. */ boolean_t dsl_dataset_get_uint64_array_feature(dsl_dataset_t *ds, spa_feature_t f, uint64_t *outlength, uint64_t **outp) { VERIFY(spa_feature_table[f].fi_type & ZFEATURE_TYPE_UINT64_ARRAY); if (!dsl_dataset_feature_is_active(ds, f)) { return (B_FALSE); } struct feature_type_uint64_array_arg *ftuaa = ds->ds_feature[f]; *outp = ftuaa->array; *outlength = ftuaa->length; return (B_TRUE); } void dsl_dataset_activate_feature(uint64_t dsobj, spa_feature_t f, void *arg, dmu_tx_t *tx) { spa_t *spa = dmu_tx_pool(tx)->dp_spa; objset_t *mos = dmu_tx_pool(tx)->dp_meta_objset; uint64_t zero = 0; VERIFY(spa_feature_table[f].fi_flags & ZFEATURE_FLAG_PER_DATASET); spa_feature_incr(spa, f, tx); dmu_object_zapify(mos, dsobj, DMU_OT_DSL_DATASET, tx); switch (spa_feature_table[f].fi_type) { case ZFEATURE_TYPE_BOOLEAN: ASSERT3S((boolean_t)(uintptr_t)arg, ==, B_TRUE); VERIFY0(zap_add(mos, dsobj, spa_feature_table[f].fi_guid, sizeof (zero), 1, &zero, tx)); break; case ZFEATURE_TYPE_UINT64_ARRAY: { struct feature_type_uint64_array_arg *ftuaa = arg; VERIFY0(zap_add(mos, dsobj, spa_feature_table[f].fi_guid, sizeof (uint64_t), ftuaa->length, ftuaa->array, tx)); break; } default: panic("Invalid zfeature type %d", spa_feature_table[f].fi_type); } } static void dsl_dataset_deactivate_feature_impl(dsl_dataset_t *ds, spa_feature_t f, dmu_tx_t *tx) { spa_t *spa = dmu_tx_pool(tx)->dp_spa; objset_t *mos = dmu_tx_pool(tx)->dp_meta_objset; uint64_t dsobj = ds->ds_object; VERIFY(spa_feature_table[f].fi_flags & ZFEATURE_FLAG_PER_DATASET); VERIFY0(zap_remove(mos, dsobj, spa_feature_table[f].fi_guid, tx)); spa_feature_decr(spa, f, tx); ds->ds_feature[f] = NULL; } void dsl_dataset_deactivate_feature(dsl_dataset_t *ds, spa_feature_t f, dmu_tx_t *tx) { unload_zfeature(ds, f); dsl_dataset_deactivate_feature_impl(ds, f, tx); } uint64_t dsl_dataset_create_sync_dd(dsl_dir_t *dd, dsl_dataset_t *origin, dsl_crypto_params_t *dcp, uint64_t flags, dmu_tx_t *tx) { dsl_pool_t *dp = dd->dd_pool; dmu_buf_t *dbuf; dsl_dataset_phys_t *dsphys; uint64_t dsobj; objset_t *mos = dp->dp_meta_objset; if (origin == NULL) origin = dp->dp_origin_snap; ASSERT(origin == NULL || origin->ds_dir->dd_pool == dp); ASSERT(origin == NULL || dsl_dataset_phys(origin)->ds_num_children > 0); ASSERT(dmu_tx_is_syncing(tx)); ASSERT(dsl_dir_phys(dd)->dd_head_dataset_obj == 0); dsobj = dmu_object_alloc(mos, DMU_OT_DSL_DATASET, 0, DMU_OT_DSL_DATASET, sizeof (dsl_dataset_phys_t), tx); VERIFY0(dmu_bonus_hold(mos, dsobj, FTAG, &dbuf)); dmu_buf_will_dirty(dbuf, tx); dsphys = dbuf->db_data; memset(dsphys, 0, sizeof (dsl_dataset_phys_t)); dsphys->ds_dir_obj = dd->dd_object; dsphys->ds_flags = flags; dsphys->ds_fsid_guid = unique_create(); (void) random_get_pseudo_bytes((void*)&dsphys->ds_guid, sizeof (dsphys->ds_guid)); dsphys->ds_snapnames_zapobj = zap_create_norm(mos, U8_TEXTPREP_TOUPPER, DMU_OT_DSL_DS_SNAP_MAP, DMU_OT_NONE, 0, tx); dsphys->ds_creation_time = gethrestime_sec(); dsphys->ds_creation_txg = tx->tx_txg == TXG_INITIAL ? 1 : tx->tx_txg; if (origin == NULL) { dsphys->ds_deadlist_obj = dsl_deadlist_alloc(mos, tx); } else { dsl_dataset_t *ohds; /* head of the origin snapshot */ dsphys->ds_prev_snap_obj = origin->ds_object; dsphys->ds_prev_snap_txg = dsl_dataset_phys(origin)->ds_creation_txg; dsphys->ds_referenced_bytes = dsl_dataset_phys(origin)->ds_referenced_bytes; dsphys->ds_compressed_bytes = dsl_dataset_phys(origin)->ds_compressed_bytes; dsphys->ds_uncompressed_bytes = dsl_dataset_phys(origin)->ds_uncompressed_bytes; rrw_enter(&origin->ds_bp_rwlock, RW_READER, FTAG); dsphys->ds_bp = dsl_dataset_phys(origin)->ds_bp; rrw_exit(&origin->ds_bp_rwlock, FTAG); /* * Inherit flags that describe the dataset's contents * (INCONSISTENT) or properties (Case Insensitive). */ dsphys->ds_flags |= dsl_dataset_phys(origin)->ds_flags & (DS_FLAG_INCONSISTENT | DS_FLAG_CI_DATASET); for (spa_feature_t f = 0; f < SPA_FEATURES; f++) { if (zfeature_active(f, origin->ds_feature[f])) { dsl_dataset_activate_feature(dsobj, f, origin->ds_feature[f], tx); } } dmu_buf_will_dirty(origin->ds_dbuf, tx); dsl_dataset_phys(origin)->ds_num_children++; VERIFY0(dsl_dataset_hold_obj(dp, dsl_dir_phys(origin->ds_dir)->dd_head_dataset_obj, FTAG, &ohds)); dsphys->ds_deadlist_obj = dsl_deadlist_clone(&ohds->ds_deadlist, dsphys->ds_prev_snap_txg, dsphys->ds_prev_snap_obj, tx); dsl_dataset_rele(ohds, FTAG); if (spa_version(dp->dp_spa) >= SPA_VERSION_NEXT_CLONES) { if (dsl_dataset_phys(origin)->ds_next_clones_obj == 0) { dsl_dataset_phys(origin)->ds_next_clones_obj = zap_create(mos, DMU_OT_NEXT_CLONES, DMU_OT_NONE, 0, tx); } VERIFY0(zap_add_int(mos, dsl_dataset_phys(origin)->ds_next_clones_obj, dsobj, tx)); } dmu_buf_will_dirty(dd->dd_dbuf, tx); dsl_dir_phys(dd)->dd_origin_obj = origin->ds_object; if (spa_version(dp->dp_spa) >= SPA_VERSION_DIR_CLONES) { if (dsl_dir_phys(origin->ds_dir)->dd_clones == 0) { dmu_buf_will_dirty(origin->ds_dir->dd_dbuf, tx); dsl_dir_phys(origin->ds_dir)->dd_clones = zap_create(mos, DMU_OT_DSL_CLONES, DMU_OT_NONE, 0, tx); } VERIFY0(zap_add_int(mos, dsl_dir_phys(origin->ds_dir)->dd_clones, dsobj, tx)); } } /* handle encryption */ dsl_dataset_create_crypt_sync(dsobj, dd, origin, dcp, tx); if (spa_version(dp->dp_spa) >= SPA_VERSION_UNIQUE_ACCURATE) dsphys->ds_flags |= DS_FLAG_UNIQUE_ACCURATE; dmu_buf_rele(dbuf, FTAG); dmu_buf_will_dirty(dd->dd_dbuf, tx); dsl_dir_phys(dd)->dd_head_dataset_obj = dsobj; return (dsobj); } static void dsl_dataset_zero_zil(dsl_dataset_t *ds, dmu_tx_t *tx) { objset_t *os; VERIFY0(dmu_objset_from_ds(ds, &os)); if (memcmp(&os->os_zil_header, &zero_zil, sizeof (zero_zil)) != 0) { dsl_pool_t *dp = ds->ds_dir->dd_pool; zio_t *zio; memset(&os->os_zil_header, 0, sizeof (os->os_zil_header)); if (os->os_encrypted) os->os_next_write_raw[tx->tx_txg & TXG_MASK] = B_TRUE; zio = zio_root(dp->dp_spa, NULL, NULL, ZIO_FLAG_MUSTSUCCEED); dsl_dataset_sync(ds, zio, tx); VERIFY0(zio_wait(zio)); /* dsl_dataset_sync_done will drop this reference. */ dmu_buf_add_ref(ds->ds_dbuf, ds); dsl_dataset_sync_done(ds, tx); } } uint64_t dsl_dataset_create_sync(dsl_dir_t *pdd, const char *lastname, dsl_dataset_t *origin, uint64_t flags, cred_t *cr, dsl_crypto_params_t *dcp, dmu_tx_t *tx) { dsl_pool_t *dp = pdd->dd_pool; uint64_t dsobj, ddobj; dsl_dir_t *dd; ASSERT(dmu_tx_is_syncing(tx)); ASSERT(lastname[0] != '@'); /* * Filesystems will eventually have their origin set to dp_origin_snap, * but that's taken care of in dsl_dataset_create_sync_dd. When * creating a filesystem, this function is called with origin equal to * NULL. */ if (origin != NULL) ASSERT3P(origin, !=, dp->dp_origin_snap); ddobj = dsl_dir_create_sync(dp, pdd, lastname, tx); VERIFY0(dsl_dir_hold_obj(dp, ddobj, lastname, FTAG, &dd)); dsobj = dsl_dataset_create_sync_dd(dd, origin, dcp, flags & ~DS_CREATE_FLAG_NODIRTY, tx); dsl_deleg_set_create_perms(dd, tx, cr); /* * If we are creating a clone and the livelist feature is enabled, * add the entry DD_FIELD_LIVELIST to ZAP. */ if (origin != NULL && spa_feature_is_enabled(dp->dp_spa, SPA_FEATURE_LIVELIST)) { objset_t *mos = dd->dd_pool->dp_meta_objset; dsl_dir_zapify(dd, tx); uint64_t obj = dsl_deadlist_alloc(mos, tx); VERIFY0(zap_add(mos, dd->dd_object, DD_FIELD_LIVELIST, sizeof (uint64_t), 1, &obj, tx)); spa_feature_incr(dp->dp_spa, SPA_FEATURE_LIVELIST, tx); } /* * Since we're creating a new node we know it's a leaf, so we can * initialize the counts if the limit feature is active. */ if (spa_feature_is_active(dp->dp_spa, SPA_FEATURE_FS_SS_LIMIT)) { uint64_t cnt = 0; objset_t *os = dd->dd_pool->dp_meta_objset; dsl_dir_zapify(dd, tx); VERIFY0(zap_add(os, dd->dd_object, DD_FIELD_FILESYSTEM_COUNT, sizeof (cnt), 1, &cnt, tx)); VERIFY0(zap_add(os, dd->dd_object, DD_FIELD_SNAPSHOT_COUNT, sizeof (cnt), 1, &cnt, tx)); } dsl_dir_rele(dd, FTAG); /* * If we are creating a clone, make sure we zero out any stale * data from the origin snapshots zil header. */ if (origin != NULL && !(flags & DS_CREATE_FLAG_NODIRTY)) { dsl_dataset_t *ds; VERIFY0(dsl_dataset_hold_obj(dp, dsobj, FTAG, &ds)); dsl_dataset_zero_zil(ds, tx); dsl_dataset_rele(ds, FTAG); } return (dsobj); } /* * The unique space in the head dataset can be calculated by subtracting * the space used in the most recent snapshot, that is still being used * in this file system, from the space currently in use. To figure out * the space in the most recent snapshot still in use, we need to take * the total space used in the snapshot and subtract out the space that * has been freed up since the snapshot was taken. */ void dsl_dataset_recalc_head_uniq(dsl_dataset_t *ds) { uint64_t mrs_used; uint64_t dlused, dlcomp, dluncomp; ASSERT(!ds->ds_is_snapshot); if (dsl_dataset_phys(ds)->ds_prev_snap_obj != 0) mrs_used = dsl_dataset_phys(ds->ds_prev)->ds_referenced_bytes; else mrs_used = 0; dsl_deadlist_space(&ds->ds_deadlist, &dlused, &dlcomp, &dluncomp); ASSERT3U(dlused, <=, mrs_used); dsl_dataset_phys(ds)->ds_unique_bytes = dsl_dataset_phys(ds)->ds_referenced_bytes - (mrs_used - dlused); if (spa_version(ds->ds_dir->dd_pool->dp_spa) >= SPA_VERSION_UNIQUE_ACCURATE) dsl_dataset_phys(ds)->ds_flags |= DS_FLAG_UNIQUE_ACCURATE; } void dsl_dataset_remove_from_next_clones(dsl_dataset_t *ds, uint64_t obj, dmu_tx_t *tx) { objset_t *mos = ds->ds_dir->dd_pool->dp_meta_objset; uint64_t count __maybe_unused; int err; ASSERT(dsl_dataset_phys(ds)->ds_num_children >= 2); err = zap_remove_int(mos, dsl_dataset_phys(ds)->ds_next_clones_obj, obj, tx); /* * The err should not be ENOENT, but a bug in a previous version * of the code could cause upgrade_clones_cb() to not set * ds_next_snap_obj when it should, leading to a missing entry. * If we knew that the pool was created after * SPA_VERSION_NEXT_CLONES, we could assert that it isn't * ENOENT. However, at least we can check that we don't have * too many entries in the next_clones_obj even after failing to * remove this one. */ if (err != ENOENT) VERIFY0(err); ASSERT0(zap_count(mos, dsl_dataset_phys(ds)->ds_next_clones_obj, &count)); ASSERT3U(count, <=, dsl_dataset_phys(ds)->ds_num_children - 2); } blkptr_t * dsl_dataset_get_blkptr(dsl_dataset_t *ds) { return (&dsl_dataset_phys(ds)->ds_bp); } spa_t * dsl_dataset_get_spa(dsl_dataset_t *ds) { return (ds->ds_dir->dd_pool->dp_spa); } void dsl_dataset_dirty(dsl_dataset_t *ds, dmu_tx_t *tx) { dsl_pool_t *dp; if (ds == NULL) /* this is the meta-objset */ return; ASSERT(ds->ds_objset != NULL); if (dsl_dataset_phys(ds)->ds_next_snap_obj != 0) panic("dirtying snapshot!"); /* Must not dirty a dataset in the same txg where it got snapshotted. */ ASSERT3U(tx->tx_txg, >, dsl_dataset_phys(ds)->ds_prev_snap_txg); dp = ds->ds_dir->dd_pool; if (txg_list_add(&dp->dp_dirty_datasets, ds, tx->tx_txg)) { objset_t *os = ds->ds_objset; /* up the hold count until we can be written out */ dmu_buf_add_ref(ds->ds_dbuf, ds); /* if this dataset is encrypted, grab a reference to the DCK */ if (ds->ds_dir->dd_crypto_obj != 0 && !os->os_raw_receive && !os->os_next_write_raw[tx->tx_txg & TXG_MASK]) { ASSERT3P(ds->ds_key_mapping, !=, NULL); key_mapping_add_ref(ds->ds_key_mapping, ds); } } } static int dsl_dataset_snapshot_reserve_space(dsl_dataset_t *ds, dmu_tx_t *tx) { uint64_t asize; if (!dmu_tx_is_syncing(tx)) return (0); /* * If there's an fs-only reservation, any blocks that might become * owned by the snapshot dataset must be accommodated by space * outside of the reservation. */ ASSERT(ds->ds_reserved == 0 || DS_UNIQUE_IS_ACCURATE(ds)); asize = MIN(dsl_dataset_phys(ds)->ds_unique_bytes, ds->ds_reserved); if (asize > dsl_dir_space_available(ds->ds_dir, NULL, 0, TRUE)) return (SET_ERROR(ENOSPC)); /* * Propagate any reserved space for this snapshot to other * snapshot checks in this sync group. */ if (asize > 0) dsl_dir_willuse_space(ds->ds_dir, asize, tx); return (0); } int dsl_dataset_snapshot_check_impl(dsl_dataset_t *ds, const char *snapname, dmu_tx_t *tx, boolean_t recv, uint64_t cnt, cred_t *cr, proc_t *proc) { int error; uint64_t value; ds->ds_trysnap_txg = tx->tx_txg; if (!dmu_tx_is_syncing(tx)) return (0); /* * We don't allow multiple snapshots of the same txg. If there * is already one, try again. */ if (dsl_dataset_phys(ds)->ds_prev_snap_txg >= tx->tx_txg) return (SET_ERROR(EAGAIN)); /* * Check for conflicting snapshot name. */ error = dsl_dataset_snap_lookup(ds, snapname, &value); if (error == 0) return (SET_ERROR(EEXIST)); if (error != ENOENT) return (error); /* * We don't allow taking snapshots of inconsistent datasets, such as * those into which we are currently receiving. However, if we are * creating this snapshot as part of a receive, this check will be * executed atomically with respect to the completion of the receive * itself but prior to the clearing of DS_FLAG_INCONSISTENT; in this * case we ignore this, knowing it will be fixed up for us shortly in * dmu_recv_end_sync(). */ if (!recv && DS_IS_INCONSISTENT(ds)) return (SET_ERROR(EBUSY)); /* * Skip the check for temporary snapshots or if we have already checked * the counts in dsl_dataset_snapshot_check. This means we really only * check the count here when we're receiving a stream. */ if (cnt != 0 && cr != NULL) { error = dsl_fs_ss_limit_check(ds->ds_dir, cnt, ZFS_PROP_SNAPSHOT_LIMIT, NULL, cr, proc); if (error != 0) return (error); } error = dsl_dataset_snapshot_reserve_space(ds, tx); if (error != 0) return (error); return (0); } int dsl_dataset_snapshot_check(void *arg, dmu_tx_t *tx) { dsl_dataset_snapshot_arg_t *ddsa = arg; dsl_pool_t *dp = dmu_tx_pool(tx); nvpair_t *pair; int rv = 0; /* * Pre-compute how many total new snapshots will be created for each * level in the tree and below. This is needed for validating the * snapshot limit when either taking a recursive snapshot or when * taking multiple snapshots. * * The problem is that the counts are not actually adjusted when * we are checking, only when we finally sync. For a single snapshot, * this is easy, the count will increase by 1 at each node up the tree, * but its more complicated for the recursive/multiple snapshot case. * * The dsl_fs_ss_limit_check function does recursively check the count * at each level up the tree but since it is validating each snapshot * independently we need to be sure that we are validating the complete * count for the entire set of snapshots. We do this by rolling up the * counts for each component of the name into an nvlist and then * checking each of those cases with the aggregated count. * * This approach properly handles not only the recursive snapshot * case (where we get all of those on the ddsa_snaps list) but also * the sibling case (e.g. snapshot a/b and a/c so that we will also * validate the limit on 'a' using a count of 2). * * We validate the snapshot names in the third loop and only report * name errors once. */ if (dmu_tx_is_syncing(tx)) { char *nm; nvlist_t *cnt_track = NULL; cnt_track = fnvlist_alloc(); nm = kmem_alloc(MAXPATHLEN, KM_SLEEP); /* Rollup aggregated counts into the cnt_track list */ for (pair = nvlist_next_nvpair(ddsa->ddsa_snaps, NULL); pair != NULL; pair = nvlist_next_nvpair(ddsa->ddsa_snaps, pair)) { char *pdelim; uint64_t val; (void) strlcpy(nm, nvpair_name(pair), MAXPATHLEN); pdelim = strchr(nm, '@'); if (pdelim == NULL) continue; *pdelim = '\0'; do { if (nvlist_lookup_uint64(cnt_track, nm, &val) == 0) { /* update existing entry */ fnvlist_add_uint64(cnt_track, nm, val + 1); } else { /* add to list */ fnvlist_add_uint64(cnt_track, nm, 1); } pdelim = strrchr(nm, '/'); if (pdelim != NULL) *pdelim = '\0'; } while (pdelim != NULL); } kmem_free(nm, MAXPATHLEN); /* Check aggregated counts at each level */ for (pair = nvlist_next_nvpair(cnt_track, NULL); pair != NULL; pair = nvlist_next_nvpair(cnt_track, pair)) { int error = 0; char *name; uint64_t cnt = 0; dsl_dataset_t *ds; name = nvpair_name(pair); cnt = fnvpair_value_uint64(pair); ASSERT(cnt > 0); error = dsl_dataset_hold(dp, name, FTAG, &ds); if (error == 0) { error = dsl_fs_ss_limit_check(ds->ds_dir, cnt, ZFS_PROP_SNAPSHOT_LIMIT, NULL, ddsa->ddsa_cr, ddsa->ddsa_proc); dsl_dataset_rele(ds, FTAG); } if (error != 0) { if (ddsa->ddsa_errors != NULL) fnvlist_add_int32(ddsa->ddsa_errors, name, error); rv = error; /* only report one error for this check */ break; } } nvlist_free(cnt_track); } for (pair = nvlist_next_nvpair(ddsa->ddsa_snaps, NULL); pair != NULL; pair = nvlist_next_nvpair(ddsa->ddsa_snaps, pair)) { int error = 0; dsl_dataset_t *ds; char *name, *atp = NULL; char dsname[ZFS_MAX_DATASET_NAME_LEN]; name = nvpair_name(pair); if (strlen(name) >= ZFS_MAX_DATASET_NAME_LEN) error = SET_ERROR(ENAMETOOLONG); if (error == 0) { atp = strchr(name, '@'); if (atp == NULL) error = SET_ERROR(EINVAL); if (error == 0) (void) strlcpy(dsname, name, atp - name + 1); } if (error == 0) error = dsl_dataset_hold(dp, dsname, FTAG, &ds); if (error == 0) { /* passing 0/NULL skips dsl_fs_ss_limit_check */ error = dsl_dataset_snapshot_check_impl(ds, atp + 1, tx, B_FALSE, 0, NULL, NULL); dsl_dataset_rele(ds, FTAG); } if (error != 0) { if (ddsa->ddsa_errors != NULL) { fnvlist_add_int32(ddsa->ddsa_errors, name, error); } rv = error; } } return (rv); } void dsl_dataset_snapshot_sync_impl(dsl_dataset_t *ds, const char *snapname, dmu_tx_t *tx) { dsl_pool_t *dp = ds->ds_dir->dd_pool; dmu_buf_t *dbuf; dsl_dataset_phys_t *dsphys; uint64_t dsobj, crtxg; objset_t *mos = dp->dp_meta_objset; static zil_header_t zero_zil __maybe_unused; objset_t *os __maybe_unused; ASSERT(RRW_WRITE_HELD(&dp->dp_config_rwlock)); /* * If we are on an old pool, the zil must not be active, in which * case it will be zeroed. Usually zil_suspend() accomplishes this. */ ASSERT(spa_version(dmu_tx_pool(tx)->dp_spa) >= SPA_VERSION_FAST_SNAP || dmu_objset_from_ds(ds, &os) != 0 || memcmp(&os->os_phys->os_zil_header, &zero_zil, sizeof (zero_zil)) == 0); /* Should not snapshot a dirty dataset. */ ASSERT(!txg_list_member(&ds->ds_dir->dd_pool->dp_dirty_datasets, ds, tx->tx_txg)); dsl_fs_ss_count_adjust(ds->ds_dir, 1, DD_FIELD_SNAPSHOT_COUNT, tx); /* * The origin's ds_creation_txg has to be < TXG_INITIAL */ if (strcmp(snapname, ORIGIN_DIR_NAME) == 0) crtxg = 1; else crtxg = tx->tx_txg; dsobj = dmu_object_alloc(mos, DMU_OT_DSL_DATASET, 0, DMU_OT_DSL_DATASET, sizeof (dsl_dataset_phys_t), tx); VERIFY0(dmu_bonus_hold(mos, dsobj, FTAG, &dbuf)); dmu_buf_will_dirty(dbuf, tx); dsphys = dbuf->db_data; memset(dsphys, 0, sizeof (dsl_dataset_phys_t)); dsphys->ds_dir_obj = ds->ds_dir->dd_object; dsphys->ds_fsid_guid = unique_create(); (void) random_get_pseudo_bytes((void*)&dsphys->ds_guid, sizeof (dsphys->ds_guid)); dsphys->ds_prev_snap_obj = dsl_dataset_phys(ds)->ds_prev_snap_obj; dsphys->ds_prev_snap_txg = dsl_dataset_phys(ds)->ds_prev_snap_txg; dsphys->ds_next_snap_obj = ds->ds_object; dsphys->ds_num_children = 1; dsphys->ds_creation_time = gethrestime_sec(); dsphys->ds_creation_txg = crtxg; dsphys->ds_deadlist_obj = dsl_dataset_phys(ds)->ds_deadlist_obj; dsphys->ds_referenced_bytes = dsl_dataset_phys(ds)->ds_referenced_bytes; dsphys->ds_compressed_bytes = dsl_dataset_phys(ds)->ds_compressed_bytes; dsphys->ds_uncompressed_bytes = dsl_dataset_phys(ds)->ds_uncompressed_bytes; dsphys->ds_flags = dsl_dataset_phys(ds)->ds_flags; rrw_enter(&ds->ds_bp_rwlock, RW_READER, FTAG); dsphys->ds_bp = dsl_dataset_phys(ds)->ds_bp; rrw_exit(&ds->ds_bp_rwlock, FTAG); dmu_buf_rele(dbuf, FTAG); for (spa_feature_t f = 0; f < SPA_FEATURES; f++) { if (zfeature_active(f, ds->ds_feature[f])) { dsl_dataset_activate_feature(dsobj, f, ds->ds_feature[f], tx); } } ASSERT3U(ds->ds_prev != 0, ==, dsl_dataset_phys(ds)->ds_prev_snap_obj != 0); if (ds->ds_prev) { uint64_t next_clones_obj = dsl_dataset_phys(ds->ds_prev)->ds_next_clones_obj; ASSERT(dsl_dataset_phys(ds->ds_prev)->ds_next_snap_obj == ds->ds_object || dsl_dataset_phys(ds->ds_prev)->ds_num_children > 1); if (dsl_dataset_phys(ds->ds_prev)->ds_next_snap_obj == ds->ds_object) { dmu_buf_will_dirty(ds->ds_prev->ds_dbuf, tx); ASSERT3U(dsl_dataset_phys(ds)->ds_prev_snap_txg, ==, dsl_dataset_phys(ds->ds_prev)->ds_creation_txg); dsl_dataset_phys(ds->ds_prev)->ds_next_snap_obj = dsobj; } else if (next_clones_obj != 0) { dsl_dataset_remove_from_next_clones(ds->ds_prev, dsphys->ds_next_snap_obj, tx); VERIFY0(zap_add_int(mos, next_clones_obj, dsobj, tx)); } } /* * If we have a reference-reservation on this dataset, we will * need to increase the amount of refreservation being charged * since our unique space is going to zero. */ if (ds->ds_reserved) { int64_t delta; ASSERT(DS_UNIQUE_IS_ACCURATE(ds)); delta = MIN(dsl_dataset_phys(ds)->ds_unique_bytes, ds->ds_reserved); dsl_dir_diduse_space(ds->ds_dir, DD_USED_REFRSRV, delta, 0, 0, tx); } dmu_buf_will_dirty(ds->ds_dbuf, tx); dsl_dataset_phys(ds)->ds_deadlist_obj = dsl_deadlist_clone(&ds->ds_deadlist, UINT64_MAX, dsl_dataset_phys(ds)->ds_prev_snap_obj, tx); dsl_deadlist_close(&ds->ds_deadlist); dsl_deadlist_open(&ds->ds_deadlist, mos, dsl_dataset_phys(ds)->ds_deadlist_obj); dsl_deadlist_add_key(&ds->ds_deadlist, dsl_dataset_phys(ds)->ds_prev_snap_txg, tx); dsl_bookmark_snapshotted(ds, tx); if (dsl_dataset_remap_deadlist_exists(ds)) { uint64_t remap_deadlist_obj = dsl_dataset_get_remap_deadlist_object(ds); /* * Move the remap_deadlist to the snapshot. The head * will create a new remap deadlist on demand, from * dsl_dataset_block_remapped(). */ dsl_dataset_unset_remap_deadlist_object(ds, tx); dsl_deadlist_close(&ds->ds_remap_deadlist); dmu_object_zapify(mos, dsobj, DMU_OT_DSL_DATASET, tx); VERIFY0(zap_add(mos, dsobj, DS_FIELD_REMAP_DEADLIST, sizeof (remap_deadlist_obj), 1, &remap_deadlist_obj, tx)); } /* * Create a ivset guid for this snapshot if the dataset is * encrypted. This may be overridden by a raw receive. A * previous implementation of this code did not have this * field as part of the on-disk format for ZFS encryption * (see errata #4). As part of the remediation for this * issue, we ask the user to enable the bookmark_v2 feature * which is now a dependency of the encryption feature. We * use this as a heuristic to determine when the user has * elected to correct any datasets created with the old code. * As a result, we only do this step if the bookmark_v2 * feature is enabled, which limits the number of states a * given pool / dataset can be in with regards to terms of * correcting the issue. */ if (ds->ds_dir->dd_crypto_obj != 0 && spa_feature_is_enabled(dp->dp_spa, SPA_FEATURE_BOOKMARK_V2)) { uint64_t ivset_guid = unique_create(); dmu_object_zapify(mos, dsobj, DMU_OT_DSL_DATASET, tx); VERIFY0(zap_add(mos, dsobj, DS_FIELD_IVSET_GUID, sizeof (ivset_guid), 1, &ivset_guid, tx)); } ASSERT3U(dsl_dataset_phys(ds)->ds_prev_snap_txg, <, tx->tx_txg); dsl_dataset_phys(ds)->ds_prev_snap_obj = dsobj; dsl_dataset_phys(ds)->ds_prev_snap_txg = crtxg; dsl_dataset_phys(ds)->ds_unique_bytes = 0; if (spa_version(dp->dp_spa) >= SPA_VERSION_UNIQUE_ACCURATE) dsl_dataset_phys(ds)->ds_flags |= DS_FLAG_UNIQUE_ACCURATE; VERIFY0(zap_add(mos, dsl_dataset_phys(ds)->ds_snapnames_zapobj, snapname, 8, 1, &dsobj, tx)); if (ds->ds_prev) dsl_dataset_rele(ds->ds_prev, ds); VERIFY0(dsl_dataset_hold_obj(dp, dsl_dataset_phys(ds)->ds_prev_snap_obj, ds, &ds->ds_prev)); dsl_scan_ds_snapshotted(ds, tx); dsl_dir_snap_cmtime_update(ds->ds_dir, tx); if (zfs_snapshot_history_enabled) spa_history_log_internal_ds(ds->ds_prev, "snapshot", tx, " "); } void dsl_dataset_snapshot_sync(void *arg, dmu_tx_t *tx) { dsl_dataset_snapshot_arg_t *ddsa = arg; dsl_pool_t *dp = dmu_tx_pool(tx); nvpair_t *pair; for (pair = nvlist_next_nvpair(ddsa->ddsa_snaps, NULL); pair != NULL; pair = nvlist_next_nvpair(ddsa->ddsa_snaps, pair)) { dsl_dataset_t *ds; char *name, *atp; char dsname[ZFS_MAX_DATASET_NAME_LEN]; name = nvpair_name(pair); atp = strchr(name, '@'); (void) strlcpy(dsname, name, atp - name + 1); VERIFY0(dsl_dataset_hold(dp, dsname, FTAG, &ds)); dsl_dataset_snapshot_sync_impl(ds, atp + 1, tx); if (ddsa->ddsa_props != NULL) { dsl_props_set_sync_impl(ds->ds_prev, ZPROP_SRC_LOCAL, ddsa->ddsa_props, tx); } dsl_dataset_rele(ds, FTAG); } } /* * The snapshots must all be in the same pool. * All-or-nothing: if there are any failures, nothing will be modified. */ int dsl_dataset_snapshot(nvlist_t *snaps, nvlist_t *props, nvlist_t *errors) { dsl_dataset_snapshot_arg_t ddsa; nvpair_t *pair; boolean_t needsuspend; int error; spa_t *spa; char *firstname; nvlist_t *suspended = NULL; pair = nvlist_next_nvpair(snaps, NULL); if (pair == NULL) return (0); firstname = nvpair_name(pair); error = spa_open(firstname, &spa, FTAG); if (error != 0) return (error); needsuspend = (spa_version(spa) < SPA_VERSION_FAST_SNAP); spa_close(spa, FTAG); if (needsuspend) { suspended = fnvlist_alloc(); for (pair = nvlist_next_nvpair(snaps, NULL); pair != NULL; pair = nvlist_next_nvpair(snaps, pair)) { char fsname[ZFS_MAX_DATASET_NAME_LEN]; char *snapname = nvpair_name(pair); char *atp; void *cookie; atp = strchr(snapname, '@'); if (atp == NULL) { error = SET_ERROR(EINVAL); break; } (void) strlcpy(fsname, snapname, atp - snapname + 1); error = zil_suspend(fsname, &cookie); if (error != 0) break; fnvlist_add_uint64(suspended, fsname, (uintptr_t)cookie); } } ddsa.ddsa_snaps = snaps; ddsa.ddsa_props = props; ddsa.ddsa_errors = errors; ddsa.ddsa_cr = CRED(); ddsa.ddsa_proc = curproc; if (error == 0) { error = dsl_sync_task(firstname, dsl_dataset_snapshot_check, dsl_dataset_snapshot_sync, &ddsa, fnvlist_num_pairs(snaps) * 3, ZFS_SPACE_CHECK_NORMAL); } if (suspended != NULL) { for (pair = nvlist_next_nvpair(suspended, NULL); pair != NULL; pair = nvlist_next_nvpair(suspended, pair)) { zil_resume((void *)(uintptr_t) fnvpair_value_uint64(pair)); } fnvlist_free(suspended); } if (error == 0) { for (pair = nvlist_next_nvpair(snaps, NULL); pair != NULL; pair = nvlist_next_nvpair(snaps, pair)) { zvol_create_minor(nvpair_name(pair)); } } return (error); } typedef struct dsl_dataset_snapshot_tmp_arg { const char *ddsta_fsname; const char *ddsta_snapname; minor_t ddsta_cleanup_minor; const char *ddsta_htag; } dsl_dataset_snapshot_tmp_arg_t; static int dsl_dataset_snapshot_tmp_check(void *arg, dmu_tx_t *tx) { dsl_dataset_snapshot_tmp_arg_t *ddsta = arg; dsl_pool_t *dp = dmu_tx_pool(tx); dsl_dataset_t *ds; int error; error = dsl_dataset_hold(dp, ddsta->ddsta_fsname, FTAG, &ds); if (error != 0) return (error); /* NULL cred means no limit check for tmp snapshot */ error = dsl_dataset_snapshot_check_impl(ds, ddsta->ddsta_snapname, tx, B_FALSE, 0, NULL, NULL); if (error != 0) { dsl_dataset_rele(ds, FTAG); return (error); } if (spa_version(dp->dp_spa) < SPA_VERSION_USERREFS) { dsl_dataset_rele(ds, FTAG); return (SET_ERROR(ENOTSUP)); } error = dsl_dataset_user_hold_check_one(NULL, ddsta->ddsta_htag, B_TRUE, tx); if (error != 0) { dsl_dataset_rele(ds, FTAG); return (error); } dsl_dataset_rele(ds, FTAG); return (0); } static void dsl_dataset_snapshot_tmp_sync(void *arg, dmu_tx_t *tx) { dsl_dataset_snapshot_tmp_arg_t *ddsta = arg; dsl_pool_t *dp = dmu_tx_pool(tx); dsl_dataset_t *ds = NULL; VERIFY0(dsl_dataset_hold(dp, ddsta->ddsta_fsname, FTAG, &ds)); dsl_dataset_snapshot_sync_impl(ds, ddsta->ddsta_snapname, tx); dsl_dataset_user_hold_sync_one(ds->ds_prev, ddsta->ddsta_htag, ddsta->ddsta_cleanup_minor, gethrestime_sec(), tx); dsl_destroy_snapshot_sync_impl(ds->ds_prev, B_TRUE, tx); dsl_dataset_rele(ds, FTAG); } int dsl_dataset_snapshot_tmp(const char *fsname, const char *snapname, minor_t cleanup_minor, const char *htag) { dsl_dataset_snapshot_tmp_arg_t ddsta; int error; spa_t *spa; boolean_t needsuspend; void *cookie; ddsta.ddsta_fsname = fsname; ddsta.ddsta_snapname = snapname; ddsta.ddsta_cleanup_minor = cleanup_minor; ddsta.ddsta_htag = htag; error = spa_open(fsname, &spa, FTAG); if (error != 0) return (error); needsuspend = (spa_version(spa) < SPA_VERSION_FAST_SNAP); spa_close(spa, FTAG); if (needsuspend) { error = zil_suspend(fsname, &cookie); if (error != 0) return (error); } error = dsl_sync_task(fsname, dsl_dataset_snapshot_tmp_check, dsl_dataset_snapshot_tmp_sync, &ddsta, 3, ZFS_SPACE_CHECK_RESERVED); if (needsuspend) zil_resume(cookie); return (error); } void dsl_dataset_sync(dsl_dataset_t *ds, zio_t *zio, dmu_tx_t *tx) { ASSERT(dmu_tx_is_syncing(tx)); ASSERT(ds->ds_objset != NULL); ASSERT(dsl_dataset_phys(ds)->ds_next_snap_obj == 0); /* * in case we had to change ds_fsid_guid when we opened it, * sync it out now. */ dmu_buf_will_dirty(ds->ds_dbuf, tx); dsl_dataset_phys(ds)->ds_fsid_guid = ds->ds_fsid_guid; if (ds->ds_resume_bytes[tx->tx_txg & TXG_MASK] != 0) { VERIFY0(zap_update(tx->tx_pool->dp_meta_objset, ds->ds_object, DS_FIELD_RESUME_OBJECT, 8, 1, &ds->ds_resume_object[tx->tx_txg & TXG_MASK], tx)); VERIFY0(zap_update(tx->tx_pool->dp_meta_objset, ds->ds_object, DS_FIELD_RESUME_OFFSET, 8, 1, &ds->ds_resume_offset[tx->tx_txg & TXG_MASK], tx)); VERIFY0(zap_update(tx->tx_pool->dp_meta_objset, ds->ds_object, DS_FIELD_RESUME_BYTES, 8, 1, &ds->ds_resume_bytes[tx->tx_txg & TXG_MASK], tx)); ds->ds_resume_object[tx->tx_txg & TXG_MASK] = 0; ds->ds_resume_offset[tx->tx_txg & TXG_MASK] = 0; ds->ds_resume_bytes[tx->tx_txg & TXG_MASK] = 0; } dmu_objset_sync(ds->ds_objset, zio, tx); for (spa_feature_t f = 0; f < SPA_FEATURES; f++) { if (zfeature_active(f, ds->ds_feature_activation[f])) { if (zfeature_active(f, ds->ds_feature[f])) continue; dsl_dataset_activate_feature(ds->ds_object, f, ds->ds_feature_activation[f], tx); ds->ds_feature[f] = ds->ds_feature_activation[f]; } } } /* * Check if the percentage of blocks shared between the clone and the * snapshot (as opposed to those that are clone only) is below a certain * threshold */ static boolean_t dsl_livelist_should_disable(dsl_dataset_t *ds) { uint64_t used, referenced; int percent_shared; used = dsl_dir_get_usedds(ds->ds_dir); referenced = dsl_get_referenced(ds); if (referenced == 0) return (B_FALSE); percent_shared = (100 * (referenced - used)) / referenced; if (percent_shared <= zfs_livelist_min_percent_shared) return (B_TRUE); return (B_FALSE); } /* * Check if it is possible to combine two livelist entries into one. * This is the case if the combined number of 'live' blkptrs (ALLOCs that * don't have a matching FREE) is under the maximum sublist size. * We check this by subtracting twice the total number of frees from the total * number of blkptrs. FREEs are counted twice because each FREE blkptr * will cancel out an ALLOC blkptr when the livelist is processed. */ static boolean_t dsl_livelist_should_condense(dsl_deadlist_entry_t *first, dsl_deadlist_entry_t *next) { uint64_t total_free = first->dle_bpobj.bpo_phys->bpo_num_freed + next->dle_bpobj.bpo_phys->bpo_num_freed; uint64_t total_entries = first->dle_bpobj.bpo_phys->bpo_num_blkptrs + next->dle_bpobj.bpo_phys->bpo_num_blkptrs; if ((total_entries - (2 * total_free)) < zfs_livelist_max_entries) return (B_TRUE); return (B_FALSE); } typedef struct try_condense_arg { spa_t *spa; dsl_dataset_t *ds; } try_condense_arg_t; /* * Iterate over the livelist entries, searching for a pair to condense. * A nonzero return value means stop, 0 means keep looking. */ static int dsl_livelist_try_condense(void *arg, dsl_deadlist_entry_t *first) { try_condense_arg_t *tca = arg; spa_t *spa = tca->spa; dsl_dataset_t *ds = tca->ds; dsl_deadlist_t *ll = &ds->ds_dir->dd_livelist; dsl_deadlist_entry_t *next; /* The condense thread has not yet been created at import */ if (spa->spa_livelist_condense_zthr == NULL) return (1); /* A condense is already in progress */ if (spa->spa_to_condense.ds != NULL) return (1); next = AVL_NEXT(&ll->dl_tree, &first->dle_node); /* The livelist has only one entry - don't condense it */ if (next == NULL) return (1); /* Next is the newest entry - don't condense it */ if (AVL_NEXT(&ll->dl_tree, &next->dle_node) == NULL) return (1); /* This pair is not ready to condense but keep looking */ if (!dsl_livelist_should_condense(first, next)) return (0); /* * Add a ref to prevent the dataset from being evicted while * the condense zthr or synctask are running. Ref will be * released at the end of the condense synctask */ dmu_buf_add_ref(ds->ds_dbuf, spa); spa->spa_to_condense.ds = ds; spa->spa_to_condense.first = first; spa->spa_to_condense.next = next; spa->spa_to_condense.syncing = B_FALSE; spa->spa_to_condense.cancelled = B_FALSE; zthr_wakeup(spa->spa_livelist_condense_zthr); return (1); } static void dsl_flush_pending_livelist(dsl_dataset_t *ds, dmu_tx_t *tx) { dsl_dir_t *dd = ds->ds_dir; spa_t *spa = ds->ds_dir->dd_pool->dp_spa; dsl_deadlist_entry_t *last = dsl_deadlist_last(&dd->dd_livelist); /* Check if we need to add a new sub-livelist */ if (last == NULL) { /* The livelist is empty */ dsl_deadlist_add_key(&dd->dd_livelist, tx->tx_txg - 1, tx); } else if (spa_sync_pass(spa) == 1) { /* * Check if the newest entry is full. If it is, make a new one. * We only do this once per sync because we could overfill a * sublist in one sync pass and don't want to add another entry * for a txg that is already represented. This ensures that * blkptrs born in the same txg are stored in the same sublist. */ bpobj_t bpobj = last->dle_bpobj; uint64_t all = bpobj.bpo_phys->bpo_num_blkptrs; uint64_t free = bpobj.bpo_phys->bpo_num_freed; uint64_t alloc = all - free; if (alloc > zfs_livelist_max_entries) { dsl_deadlist_add_key(&dd->dd_livelist, tx->tx_txg - 1, tx); } } /* Insert each entry into the on-disk livelist */ bplist_iterate(&dd->dd_pending_allocs, dsl_deadlist_insert_alloc_cb, &dd->dd_livelist, tx); bplist_iterate(&dd->dd_pending_frees, dsl_deadlist_insert_free_cb, &dd->dd_livelist, tx); /* Attempt to condense every pair of adjacent entries */ try_condense_arg_t arg = { .spa = spa, .ds = ds }; dsl_deadlist_iterate(&dd->dd_livelist, dsl_livelist_try_condense, &arg); } void dsl_dataset_sync_done(dsl_dataset_t *ds, dmu_tx_t *tx) { objset_t *os = ds->ds_objset; bplist_iterate(&ds->ds_pending_deadlist, dsl_deadlist_insert_alloc_cb, &ds->ds_deadlist, tx); if (dsl_deadlist_is_open(&ds->ds_dir->dd_livelist)) { dsl_flush_pending_livelist(ds, tx); if (dsl_livelist_should_disable(ds)) { dsl_dir_remove_livelist(ds->ds_dir, tx, B_TRUE); } } dsl_bookmark_sync_done(ds, tx); multilist_destroy(&os->os_synced_dnodes); if (os->os_encrypted) os->os_next_write_raw[tx->tx_txg & TXG_MASK] = B_FALSE; else ASSERT0(os->os_next_write_raw[tx->tx_txg & TXG_MASK]); ASSERT(!dmu_objset_is_dirty(os, dmu_tx_get_txg(tx))); dmu_buf_rele(ds->ds_dbuf, ds); } int get_clones_stat_impl(dsl_dataset_t *ds, nvlist_t *val) { uint64_t count = 0; objset_t *mos = ds->ds_dir->dd_pool->dp_meta_objset; zap_cursor_t zc; zap_attribute_t za; ASSERT(dsl_pool_config_held(ds->ds_dir->dd_pool)); /* * There may be missing entries in ds_next_clones_obj * due to a bug in a previous version of the code. * Only trust it if it has the right number of entries. */ if (dsl_dataset_phys(ds)->ds_next_clones_obj != 0) { VERIFY0(zap_count(mos, dsl_dataset_phys(ds)->ds_next_clones_obj, &count)); } if (count != dsl_dataset_phys(ds)->ds_num_children - 1) { return (SET_ERROR(ENOENT)); } for (zap_cursor_init(&zc, mos, dsl_dataset_phys(ds)->ds_next_clones_obj); zap_cursor_retrieve(&zc, &za) == 0; zap_cursor_advance(&zc)) { dsl_dataset_t *clone; char buf[ZFS_MAX_DATASET_NAME_LEN]; VERIFY0(dsl_dataset_hold_obj(ds->ds_dir->dd_pool, za.za_first_integer, FTAG, &clone)); dsl_dir_name(clone->ds_dir, buf); fnvlist_add_boolean(val, buf); dsl_dataset_rele(clone, FTAG); } zap_cursor_fini(&zc); return (0); } void get_clones_stat(dsl_dataset_t *ds, nvlist_t *nv) { nvlist_t *propval = fnvlist_alloc(); nvlist_t *val = fnvlist_alloc(); if (get_clones_stat_impl(ds, val) == 0) { fnvlist_add_nvlist(propval, ZPROP_VALUE, val); fnvlist_add_nvlist(nv, zfs_prop_to_name(ZFS_PROP_CLONES), propval); } nvlist_free(val); nvlist_free(propval); } static char * get_receive_resume_token_impl(dsl_dataset_t *ds) { if (!dsl_dataset_has_resume_receive_state(ds)) return (NULL); dsl_pool_t *dp = ds->ds_dir->dd_pool; char *str; void *packed; uint8_t *compressed; uint64_t val; nvlist_t *token_nv = fnvlist_alloc(); size_t packed_size, compressed_size; if (zap_lookup(dp->dp_meta_objset, ds->ds_object, DS_FIELD_RESUME_FROMGUID, sizeof (val), 1, &val) == 0) { fnvlist_add_uint64(token_nv, "fromguid", val); } if (zap_lookup(dp->dp_meta_objset, ds->ds_object, DS_FIELD_RESUME_OBJECT, sizeof (val), 1, &val) == 0) { fnvlist_add_uint64(token_nv, "object", val); } if (zap_lookup(dp->dp_meta_objset, ds->ds_object, DS_FIELD_RESUME_OFFSET, sizeof (val), 1, &val) == 0) { fnvlist_add_uint64(token_nv, "offset", val); } if (zap_lookup(dp->dp_meta_objset, ds->ds_object, DS_FIELD_RESUME_BYTES, sizeof (val), 1, &val) == 0) { fnvlist_add_uint64(token_nv, "bytes", val); } if (zap_lookup(dp->dp_meta_objset, ds->ds_object, DS_FIELD_RESUME_TOGUID, sizeof (val), 1, &val) == 0) { fnvlist_add_uint64(token_nv, "toguid", val); } char buf[MAXNAMELEN]; if (zap_lookup(dp->dp_meta_objset, ds->ds_object, DS_FIELD_RESUME_TONAME, 1, sizeof (buf), buf) == 0) { fnvlist_add_string(token_nv, "toname", buf); } if (zap_contains(dp->dp_meta_objset, ds->ds_object, DS_FIELD_RESUME_LARGEBLOCK) == 0) { fnvlist_add_boolean(token_nv, "largeblockok"); } if (zap_contains(dp->dp_meta_objset, ds->ds_object, DS_FIELD_RESUME_EMBEDOK) == 0) { fnvlist_add_boolean(token_nv, "embedok"); } if (zap_contains(dp->dp_meta_objset, ds->ds_object, DS_FIELD_RESUME_COMPRESSOK) == 0) { fnvlist_add_boolean(token_nv, "compressok"); } if (zap_contains(dp->dp_meta_objset, ds->ds_object, DS_FIELD_RESUME_RAWOK) == 0) { fnvlist_add_boolean(token_nv, "rawok"); } if (dsl_dataset_feature_is_active(ds, SPA_FEATURE_REDACTED_DATASETS)) { uint64_t num_redact_snaps = 0; uint64_t *redact_snaps = NULL; VERIFY3B(dsl_dataset_get_uint64_array_feature(ds, SPA_FEATURE_REDACTED_DATASETS, &num_redact_snaps, &redact_snaps), ==, B_TRUE); fnvlist_add_uint64_array(token_nv, "redact_snaps", redact_snaps, num_redact_snaps); } if (zap_contains(dp->dp_meta_objset, ds->ds_object, DS_FIELD_RESUME_REDACT_BOOKMARK_SNAPS) == 0) { uint64_t num_redact_snaps = 0, int_size = 0; uint64_t *redact_snaps = NULL; VERIFY0(zap_length(dp->dp_meta_objset, ds->ds_object, DS_FIELD_RESUME_REDACT_BOOKMARK_SNAPS, &int_size, &num_redact_snaps)); ASSERT3U(int_size, ==, sizeof (uint64_t)); redact_snaps = kmem_alloc(int_size * num_redact_snaps, KM_SLEEP); VERIFY0(zap_lookup(dp->dp_meta_objset, ds->ds_object, DS_FIELD_RESUME_REDACT_BOOKMARK_SNAPS, int_size, num_redact_snaps, redact_snaps)); fnvlist_add_uint64_array(token_nv, "book_redact_snaps", redact_snaps, num_redact_snaps); kmem_free(redact_snaps, int_size * num_redact_snaps); } packed = fnvlist_pack(token_nv, &packed_size); fnvlist_free(token_nv); compressed = kmem_alloc(packed_size, KM_SLEEP); compressed_size = gzip_compress(packed, compressed, packed_size, packed_size, 6); zio_cksum_t cksum; fletcher_4_native_varsize(compressed, compressed_size, &cksum); size_t alloc_size = compressed_size * 2 + 1; str = kmem_alloc(alloc_size, KM_SLEEP); for (int i = 0; i < compressed_size; i++) { size_t offset = i * 2; (void) snprintf(str + offset, alloc_size - offset, "%02x", compressed[i]); } str[compressed_size * 2] = '\0'; char *propval = kmem_asprintf("%u-%llx-%llx-%s", ZFS_SEND_RESUME_TOKEN_VERSION, (longlong_t)cksum.zc_word[0], (longlong_t)packed_size, str); kmem_free(packed, packed_size); kmem_free(str, alloc_size); kmem_free(compressed, packed_size); return (propval); } /* * Returns a string that represents the receive resume state token. It should * be freed with strfree(). NULL is returned if no resume state is present. */ char * get_receive_resume_token(dsl_dataset_t *ds) { /* * A failed "newfs" (e.g. full) resumable receive leaves * the stats set on this dataset. Check here for the prop. */ char *token = get_receive_resume_token_impl(ds); if (token != NULL) return (token); /* * A failed incremental resumable receive leaves the * stats set on our child named "%recv". Check the child * for the prop. */ /* 6 extra bytes for /%recv */ char name[ZFS_MAX_DATASET_NAME_LEN + 6]; dsl_dataset_t *recv_ds; dsl_dataset_name(ds, name); if (strlcat(name, "/", sizeof (name)) < sizeof (name) && strlcat(name, recv_clone_name, sizeof (name)) < sizeof (name) && dsl_dataset_hold(ds->ds_dir->dd_pool, name, FTAG, &recv_ds) == 0) { token = get_receive_resume_token_impl(recv_ds); dsl_dataset_rele(recv_ds, FTAG); } return (token); } uint64_t dsl_get_refratio(dsl_dataset_t *ds) { uint64_t ratio = dsl_dataset_phys(ds)->ds_compressed_bytes == 0 ? 100 : (dsl_dataset_phys(ds)->ds_uncompressed_bytes * 100 / dsl_dataset_phys(ds)->ds_compressed_bytes); return (ratio); } uint64_t dsl_get_logicalreferenced(dsl_dataset_t *ds) { return (dsl_dataset_phys(ds)->ds_uncompressed_bytes); } uint64_t dsl_get_compressratio(dsl_dataset_t *ds) { if (ds->ds_is_snapshot) { return (dsl_get_refratio(ds)); } else { dsl_dir_t *dd = ds->ds_dir; mutex_enter(&dd->dd_lock); uint64_t val = dsl_dir_get_compressratio(dd); mutex_exit(&dd->dd_lock); return (val); } } uint64_t dsl_get_used(dsl_dataset_t *ds) { if (ds->ds_is_snapshot) { return (dsl_dataset_phys(ds)->ds_unique_bytes); } else { dsl_dir_t *dd = ds->ds_dir; mutex_enter(&dd->dd_lock); uint64_t val = dsl_dir_get_used(dd); mutex_exit(&dd->dd_lock); return (val); } } uint64_t dsl_get_creation(dsl_dataset_t *ds) { return (dsl_dataset_phys(ds)->ds_creation_time); } uint64_t dsl_get_creationtxg(dsl_dataset_t *ds) { return (dsl_dataset_phys(ds)->ds_creation_txg); } uint64_t dsl_get_refquota(dsl_dataset_t *ds) { return (ds->ds_quota); } uint64_t dsl_get_refreservation(dsl_dataset_t *ds) { return (ds->ds_reserved); } uint64_t dsl_get_guid(dsl_dataset_t *ds) { return (dsl_dataset_phys(ds)->ds_guid); } uint64_t dsl_get_unique(dsl_dataset_t *ds) { return (dsl_dataset_phys(ds)->ds_unique_bytes); } uint64_t dsl_get_objsetid(dsl_dataset_t *ds) { return (ds->ds_object); } uint64_t dsl_get_userrefs(dsl_dataset_t *ds) { return (ds->ds_userrefs); } uint64_t dsl_get_defer_destroy(dsl_dataset_t *ds) { return (DS_IS_DEFER_DESTROY(ds) ? 1 : 0); } uint64_t dsl_get_referenced(dsl_dataset_t *ds) { return (dsl_dataset_phys(ds)->ds_referenced_bytes); } uint64_t dsl_get_numclones(dsl_dataset_t *ds) { ASSERT(ds->ds_is_snapshot); return (dsl_dataset_phys(ds)->ds_num_children - 1); } uint64_t dsl_get_inconsistent(dsl_dataset_t *ds) { return ((dsl_dataset_phys(ds)->ds_flags & DS_FLAG_INCONSISTENT) ? 1 : 0); } uint64_t dsl_get_redacted(dsl_dataset_t *ds) { return (dsl_dataset_feature_is_active(ds, SPA_FEATURE_REDACTED_DATASETS)); } uint64_t dsl_get_available(dsl_dataset_t *ds) { uint64_t refdbytes = dsl_get_referenced(ds); uint64_t availbytes = dsl_dir_space_available(ds->ds_dir, NULL, 0, TRUE); if (ds->ds_reserved > dsl_dataset_phys(ds)->ds_unique_bytes) { availbytes += ds->ds_reserved - dsl_dataset_phys(ds)->ds_unique_bytes; } if (ds->ds_quota != 0) { /* * Adjust available bytes according to refquota */ if (refdbytes < ds->ds_quota) { availbytes = MIN(availbytes, ds->ds_quota - refdbytes); } else { availbytes = 0; } } return (availbytes); } int dsl_get_written(dsl_dataset_t *ds, uint64_t *written) { dsl_pool_t *dp = ds->ds_dir->dd_pool; dsl_dataset_t *prev; int err = dsl_dataset_hold_obj(dp, dsl_dataset_phys(ds)->ds_prev_snap_obj, FTAG, &prev); if (err == 0) { uint64_t comp, uncomp; err = dsl_dataset_space_written(prev, ds, written, &comp, &uncomp); dsl_dataset_rele(prev, FTAG); } return (err); } /* * 'snap' should be a buffer of size ZFS_MAX_DATASET_NAME_LEN. */ int dsl_get_prev_snap(dsl_dataset_t *ds, char *snap) { dsl_pool_t *dp = ds->ds_dir->dd_pool; if (ds->ds_prev != NULL && ds->ds_prev != dp->dp_origin_snap) { dsl_dataset_name(ds->ds_prev, snap); return (0); } else { return (SET_ERROR(ENOENT)); } } void dsl_get_redact_snaps(dsl_dataset_t *ds, nvlist_t *propval) { uint64_t nsnaps; uint64_t *snaps; if (dsl_dataset_get_uint64_array_feature(ds, SPA_FEATURE_REDACTED_DATASETS, &nsnaps, &snaps)) { fnvlist_add_uint64_array(propval, ZPROP_VALUE, snaps, nsnaps); } } /* * Returns the mountpoint property and source for the given dataset in the value * and source buffers. The value buffer must be at least as large as MAXPATHLEN * and the source buffer as least as large a ZFS_MAX_DATASET_NAME_LEN. * Returns 0 on success and an error on failure. */ int dsl_get_mountpoint(dsl_dataset_t *ds, const char *dsname, char *value, char *source) { int error; dsl_pool_t *dp = ds->ds_dir->dd_pool; /* Retrieve the mountpoint value stored in the zap object */ error = dsl_prop_get_ds(ds, zfs_prop_to_name(ZFS_PROP_MOUNTPOINT), 1, ZAP_MAXVALUELEN, value, source); if (error != 0) { return (error); } /* * Process the dsname and source to find the full mountpoint string. * Can be skipped for 'legacy' or 'none'. */ if (value[0] == '/') { char *buf = kmem_alloc(ZAP_MAXVALUELEN, KM_SLEEP); char *root = buf; const char *relpath; /* * If we inherit the mountpoint, even from a dataset * with a received value, the source will be the path of * the dataset we inherit from. If source is * ZPROP_SOURCE_VAL_RECVD, the received value is not * inherited. */ if (strcmp(source, ZPROP_SOURCE_VAL_RECVD) == 0) { relpath = ""; } else { ASSERT0(strncmp(dsname, source, strlen(source))); relpath = dsname + strlen(source); if (relpath[0] == '/') relpath++; } spa_altroot(dp->dp_spa, root, ZAP_MAXVALUELEN); /* * Special case an alternate root of '/'. This will * avoid having multiple leading slashes in the * mountpoint path. */ if (strcmp(root, "/") == 0) root++; /* * If the mountpoint is '/' then skip over this * if we are obtaining either an alternate root or * an inherited mountpoint. */ char *mnt = value; if (value[1] == '\0' && (root[0] != '\0' || relpath[0] != '\0')) mnt = value + 1; mnt = kmem_strdup(mnt); if (relpath[0] == '\0') { (void) snprintf(value, ZAP_MAXVALUELEN, "%s%s", root, mnt); } else { (void) snprintf(value, ZAP_MAXVALUELEN, "%s%s%s%s", root, mnt, relpath[0] == '@' ? "" : "/", relpath); } kmem_free(buf, ZAP_MAXVALUELEN); kmem_strfree(mnt); } return (0); } void dsl_dataset_stats(dsl_dataset_t *ds, nvlist_t *nv) { dsl_pool_t *dp __maybe_unused = ds->ds_dir->dd_pool; ASSERT(dsl_pool_config_held(dp)); dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_REFRATIO, dsl_get_refratio(ds)); dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_LOGICALREFERENCED, dsl_get_logicalreferenced(ds)); dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_COMPRESSRATIO, dsl_get_compressratio(ds)); dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_USED, dsl_get_used(ds)); if (ds->ds_is_snapshot) { get_clones_stat(ds, nv); } else { char buf[ZFS_MAX_DATASET_NAME_LEN]; if (dsl_get_prev_snap(ds, buf) == 0) dsl_prop_nvlist_add_string(nv, ZFS_PROP_PREV_SNAP, buf); dsl_dir_stats(ds->ds_dir, nv); } nvlist_t *propval = fnvlist_alloc(); dsl_get_redact_snaps(ds, propval); fnvlist_add_nvlist(nv, zfs_prop_to_name(ZFS_PROP_REDACT_SNAPS), propval); nvlist_free(propval); dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_AVAILABLE, dsl_get_available(ds)); dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_REFERENCED, dsl_get_referenced(ds)); dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_CREATION, dsl_get_creation(ds)); dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_CREATETXG, dsl_get_creationtxg(ds)); dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_REFQUOTA, dsl_get_refquota(ds)); dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_REFRESERVATION, dsl_get_refreservation(ds)); dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_GUID, dsl_get_guid(ds)); dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_UNIQUE, dsl_get_unique(ds)); dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_OBJSETID, dsl_get_objsetid(ds)); dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_USERREFS, dsl_get_userrefs(ds)); dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_DEFER_DESTROY, dsl_get_defer_destroy(ds)); dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_SNAPSHOTS_CHANGED, dsl_dir_snap_cmtime(ds->ds_dir).tv_sec); dsl_dataset_crypt_stats(ds, nv); if (dsl_dataset_phys(ds)->ds_prev_snap_obj != 0) { uint64_t written; if (dsl_get_written(ds, &written) == 0) { dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_WRITTEN, written); } } if (!dsl_dataset_is_snapshot(ds)) { char *token = get_receive_resume_token(ds); if (token != NULL) { dsl_prop_nvlist_add_string(nv, ZFS_PROP_RECEIVE_RESUME_TOKEN, token); kmem_strfree(token); } } } void dsl_dataset_fast_stat(dsl_dataset_t *ds, dmu_objset_stats_t *stat) { dsl_pool_t *dp __maybe_unused = ds->ds_dir->dd_pool; ASSERT(dsl_pool_config_held(dp)); stat->dds_creation_txg = dsl_get_creationtxg(ds); stat->dds_inconsistent = dsl_get_inconsistent(ds); stat->dds_guid = dsl_get_guid(ds); stat->dds_redacted = dsl_get_redacted(ds); stat->dds_origin[0] = '\0'; if (ds->ds_is_snapshot) { stat->dds_is_snapshot = B_TRUE; stat->dds_num_clones = dsl_get_numclones(ds); } else { stat->dds_is_snapshot = B_FALSE; stat->dds_num_clones = 0; if (dsl_dir_is_clone(ds->ds_dir)) { dsl_dir_get_origin(ds->ds_dir, stat->dds_origin); } } } uint64_t dsl_dataset_fsid_guid(dsl_dataset_t *ds) { return (ds->ds_fsid_guid); } void dsl_dataset_space(dsl_dataset_t *ds, uint64_t *refdbytesp, uint64_t *availbytesp, uint64_t *usedobjsp, uint64_t *availobjsp) { *refdbytesp = dsl_dataset_phys(ds)->ds_referenced_bytes; *availbytesp = dsl_dir_space_available(ds->ds_dir, NULL, 0, TRUE); if (ds->ds_reserved > dsl_dataset_phys(ds)->ds_unique_bytes) *availbytesp += ds->ds_reserved - dsl_dataset_phys(ds)->ds_unique_bytes; if (ds->ds_quota != 0) { /* * Adjust available bytes according to refquota */ if (*refdbytesp < ds->ds_quota) *availbytesp = MIN(*availbytesp, ds->ds_quota - *refdbytesp); else *availbytesp = 0; } rrw_enter(&ds->ds_bp_rwlock, RW_READER, FTAG); *usedobjsp = BP_GET_FILL(&dsl_dataset_phys(ds)->ds_bp); rrw_exit(&ds->ds_bp_rwlock, FTAG); *availobjsp = DN_MAX_OBJECT - *usedobjsp; } boolean_t dsl_dataset_modified_since_snap(dsl_dataset_t *ds, dsl_dataset_t *snap) { dsl_pool_t *dp __maybe_unused = ds->ds_dir->dd_pool; uint64_t birth; ASSERT(dsl_pool_config_held(dp)); if (snap == NULL) return (B_FALSE); rrw_enter(&ds->ds_bp_rwlock, RW_READER, FTAG); birth = dsl_dataset_get_blkptr(ds)->blk_birth; rrw_exit(&ds->ds_bp_rwlock, FTAG); if (birth > dsl_dataset_phys(snap)->ds_creation_txg) { objset_t *os, *os_snap; /* * It may be that only the ZIL differs, because it was * reset in the head. Don't count that as being * modified. */ if (dmu_objset_from_ds(ds, &os) != 0) return (B_TRUE); if (dmu_objset_from_ds(snap, &os_snap) != 0) return (B_TRUE); return (memcmp(&os->os_phys->os_meta_dnode, &os_snap->os_phys->os_meta_dnode, sizeof (os->os_phys->os_meta_dnode)) != 0); } return (B_FALSE); } static int dsl_dataset_rename_snapshot_check_impl(dsl_pool_t *dp, dsl_dataset_t *hds, void *arg) { (void) dp; dsl_dataset_rename_snapshot_arg_t *ddrsa = arg; int error; uint64_t val; error = dsl_dataset_snap_lookup(hds, ddrsa->ddrsa_oldsnapname, &val); if (error != 0) { /* ignore nonexistent snapshots */ return (error == ENOENT ? 0 : error); } /* new name should not exist */ error = dsl_dataset_snap_lookup(hds, ddrsa->ddrsa_newsnapname, &val); if (error == 0) error = SET_ERROR(EEXIST); else if (error == ENOENT) error = 0; /* dataset name + 1 for the "@" + the new snapshot name must fit */ if (dsl_dir_namelen(hds->ds_dir) + 1 + strlen(ddrsa->ddrsa_newsnapname) >= ZFS_MAX_DATASET_NAME_LEN) error = SET_ERROR(ENAMETOOLONG); return (error); } int dsl_dataset_rename_snapshot_check(void *arg, dmu_tx_t *tx) { dsl_dataset_rename_snapshot_arg_t *ddrsa = arg; dsl_pool_t *dp = dmu_tx_pool(tx); dsl_dataset_t *hds; int error; error = dsl_dataset_hold(dp, ddrsa->ddrsa_fsname, FTAG, &hds); if (error != 0) return (error); if (ddrsa->ddrsa_recursive) { error = dmu_objset_find_dp(dp, hds->ds_dir->dd_object, dsl_dataset_rename_snapshot_check_impl, ddrsa, DS_FIND_CHILDREN); } else { error = dsl_dataset_rename_snapshot_check_impl(dp, hds, ddrsa); } dsl_dataset_rele(hds, FTAG); return (error); } static int dsl_dataset_rename_snapshot_sync_impl(dsl_pool_t *dp, dsl_dataset_t *hds, void *arg) { dsl_dataset_rename_snapshot_arg_t *ddrsa = arg; dsl_dataset_t *ds; uint64_t val; dmu_tx_t *tx = ddrsa->ddrsa_tx; int error; error = dsl_dataset_snap_lookup(hds, ddrsa->ddrsa_oldsnapname, &val); ASSERT(error == 0 || error == ENOENT); if (error == ENOENT) { /* ignore nonexistent snapshots */ return (0); } VERIFY0(dsl_dataset_hold_obj(dp, val, FTAG, &ds)); /* log before we change the name */ spa_history_log_internal_ds(ds, "rename", tx, "-> @%s", ddrsa->ddrsa_newsnapname); VERIFY0(dsl_dataset_snap_remove(hds, ddrsa->ddrsa_oldsnapname, tx, B_FALSE)); mutex_enter(&ds->ds_lock); (void) strlcpy(ds->ds_snapname, ddrsa->ddrsa_newsnapname, sizeof (ds->ds_snapname)); mutex_exit(&ds->ds_lock); VERIFY0(zap_add(dp->dp_meta_objset, dsl_dataset_phys(hds)->ds_snapnames_zapobj, ds->ds_snapname, 8, 1, &ds->ds_object, tx)); zvol_rename_minors(dp->dp_spa, ddrsa->ddrsa_oldsnapname, ddrsa->ddrsa_newsnapname, B_TRUE); dsl_dataset_rele(ds, FTAG); return (0); } void dsl_dataset_rename_snapshot_sync(void *arg, dmu_tx_t *tx) { dsl_dataset_rename_snapshot_arg_t *ddrsa = arg; dsl_pool_t *dp = dmu_tx_pool(tx); dsl_dataset_t *hds = NULL; VERIFY0(dsl_dataset_hold(dp, ddrsa->ddrsa_fsname, FTAG, &hds)); ddrsa->ddrsa_tx = tx; if (ddrsa->ddrsa_recursive) { VERIFY0(dmu_objset_find_dp(dp, hds->ds_dir->dd_object, dsl_dataset_rename_snapshot_sync_impl, ddrsa, DS_FIND_CHILDREN)); } else { VERIFY0(dsl_dataset_rename_snapshot_sync_impl(dp, hds, ddrsa)); } dsl_dataset_rele(hds, FTAG); } int dsl_dataset_rename_snapshot(const char *fsname, const char *oldsnapname, const char *newsnapname, boolean_t recursive) { dsl_dataset_rename_snapshot_arg_t ddrsa; ddrsa.ddrsa_fsname = fsname; ddrsa.ddrsa_oldsnapname = oldsnapname; ddrsa.ddrsa_newsnapname = newsnapname; ddrsa.ddrsa_recursive = recursive; return (dsl_sync_task(fsname, dsl_dataset_rename_snapshot_check, dsl_dataset_rename_snapshot_sync, &ddrsa, 1, ZFS_SPACE_CHECK_RESERVED)); } /* * If we're doing an ownership handoff, we need to make sure that there is * only one long hold on the dataset. We're not allowed to change anything here * so we don't permanently release the long hold or regular hold here. We want * to do this only when syncing to avoid the dataset unexpectedly going away * when we release the long hold. */ static int dsl_dataset_handoff_check(dsl_dataset_t *ds, void *owner, dmu_tx_t *tx) { boolean_t held = B_FALSE; if (!dmu_tx_is_syncing(tx)) return (0); dsl_dir_t *dd = ds->ds_dir; mutex_enter(&dd->dd_activity_lock); uint64_t holds = zfs_refcount_count(&ds->ds_longholds) - (owner != NULL ? 1 : 0); /* * The value of dd_activity_waiters can chance as soon as we drop the * lock, but we're fine with that; new waiters coming in or old * waiters leaving doesn't cause problems, since we're going to cancel * waiters later anyway. The goal of this check is to verify that no * non-waiters have long-holds, and all new long-holds will be * prevented because we're holding the pool config as writer. */ if (holds != dd->dd_activity_waiters) held = B_TRUE; mutex_exit(&dd->dd_activity_lock); if (held) return (SET_ERROR(EBUSY)); return (0); } int dsl_dataset_rollback_check(void *arg, dmu_tx_t *tx) { dsl_dataset_rollback_arg_t *ddra = arg; dsl_pool_t *dp = dmu_tx_pool(tx); dsl_dataset_t *ds; int64_t unused_refres_delta; int error; error = dsl_dataset_hold(dp, ddra->ddra_fsname, FTAG, &ds); if (error != 0) return (error); /* must not be a snapshot */ if (ds->ds_is_snapshot) { dsl_dataset_rele(ds, FTAG); return (SET_ERROR(EINVAL)); } /* must have a most recent snapshot */ if (dsl_dataset_phys(ds)->ds_prev_snap_txg < TXG_INITIAL) { dsl_dataset_rele(ds, FTAG); return (SET_ERROR(ESRCH)); } /* * No rollback to a snapshot created in the current txg, because * the rollback may dirty the dataset and create blocks that are * not reachable from the rootbp while having a birth txg that * falls into the snapshot's range. */ if (dmu_tx_is_syncing(tx) && dsl_dataset_phys(ds)->ds_prev_snap_txg >= tx->tx_txg) { dsl_dataset_rele(ds, FTAG); return (SET_ERROR(EAGAIN)); } /* * If the expected target snapshot is specified, then check that * the latest snapshot is it. */ if (ddra->ddra_tosnap != NULL) { dsl_dataset_t *snapds; /* Check if the target snapshot exists at all. */ error = dsl_dataset_hold(dp, ddra->ddra_tosnap, FTAG, &snapds); if (error != 0) { /* * ESRCH is used to signal that the target snapshot does * not exist, while ENOENT is used to report that * the rolled back dataset does not exist. * ESRCH is also used to cover other cases where the * target snapshot is not related to the dataset being * rolled back such as being in a different pool. */ if (error == ENOENT || error == EXDEV) error = SET_ERROR(ESRCH); dsl_dataset_rele(ds, FTAG); return (error); } ASSERT(snapds->ds_is_snapshot); /* Check if the snapshot is the latest snapshot indeed. */ if (snapds != ds->ds_prev) { /* * Distinguish between the case where the only problem * is intervening snapshots (EEXIST) vs the snapshot * not being a valid target for rollback (ESRCH). */ if (snapds->ds_dir == ds->ds_dir || (dsl_dir_is_clone(ds->ds_dir) && dsl_dir_phys(ds->ds_dir)->dd_origin_obj == snapds->ds_object)) { error = SET_ERROR(EEXIST); } else { error = SET_ERROR(ESRCH); } dsl_dataset_rele(snapds, FTAG); dsl_dataset_rele(ds, FTAG); return (error); } dsl_dataset_rele(snapds, FTAG); } /* must not have any bookmarks after the most recent snapshot */ if (dsl_bookmark_latest_txg(ds) > dsl_dataset_phys(ds)->ds_prev_snap_txg) { dsl_dataset_rele(ds, FTAG); return (SET_ERROR(EEXIST)); } error = dsl_dataset_handoff_check(ds, ddra->ddra_owner, tx); if (error != 0) { dsl_dataset_rele(ds, FTAG); return (error); } /* * Check if the snap we are rolling back to uses more than * the refquota. */ if (ds->ds_quota != 0 && dsl_dataset_phys(ds->ds_prev)->ds_referenced_bytes > ds->ds_quota) { dsl_dataset_rele(ds, FTAG); return (SET_ERROR(EDQUOT)); } /* * When we do the clone swap, we will temporarily use more space * due to the refreservation (the head will no longer have any * unique space, so the entire amount of the refreservation will need * to be free). We will immediately destroy the clone, freeing * this space, but the freeing happens over many txg's. */ unused_refres_delta = (int64_t)MIN(ds->ds_reserved, dsl_dataset_phys(ds)->ds_unique_bytes); if (unused_refres_delta > 0 && unused_refres_delta > dsl_dir_space_available(ds->ds_dir, NULL, 0, TRUE)) { dsl_dataset_rele(ds, FTAG); return (SET_ERROR(ENOSPC)); } dsl_dataset_rele(ds, FTAG); return (0); } void dsl_dataset_rollback_sync(void *arg, dmu_tx_t *tx) { dsl_dataset_rollback_arg_t *ddra = arg; dsl_pool_t *dp = dmu_tx_pool(tx); dsl_dataset_t *ds, *clone; uint64_t cloneobj; char namebuf[ZFS_MAX_DATASET_NAME_LEN]; VERIFY0(dsl_dataset_hold(dp, ddra->ddra_fsname, FTAG, &ds)); dsl_dataset_name(ds->ds_prev, namebuf); fnvlist_add_string(ddra->ddra_result, "target", namebuf); cloneobj = dsl_dataset_create_sync(ds->ds_dir, "%rollback", ds->ds_prev, DS_CREATE_FLAG_NODIRTY, kcred, NULL, tx); VERIFY0(dsl_dataset_hold_obj(dp, cloneobj, FTAG, &clone)); dsl_dataset_clone_swap_sync_impl(clone, ds, tx); dsl_dataset_zero_zil(ds, tx); dsl_destroy_head_sync_impl(clone, tx); dsl_dataset_rele(clone, FTAG); dsl_dataset_rele(ds, FTAG); } /* * Rolls back the given filesystem or volume to the most recent snapshot. * The name of the most recent snapshot will be returned under key "target" * in the result nvlist. * * If owner != NULL: * - The existing dataset MUST be owned by the specified owner at entry * - Upon return, dataset will still be held by the same owner, whether we * succeed or not. * * This mode is required any time the existing filesystem is mounted. See * notes above zfs_suspend_fs() for further details. */ int dsl_dataset_rollback(const char *fsname, const char *tosnap, void *owner, nvlist_t *result) { dsl_dataset_rollback_arg_t ddra; ddra.ddra_fsname = fsname; ddra.ddra_tosnap = tosnap; ddra.ddra_owner = owner; ddra.ddra_result = result; return (dsl_sync_task(fsname, dsl_dataset_rollback_check, dsl_dataset_rollback_sync, &ddra, 1, ZFS_SPACE_CHECK_RESERVED)); } struct promotenode { list_node_t link; dsl_dataset_t *ds; }; static int snaplist_space(list_t *l, uint64_t mintxg, uint64_t *spacep); static int promote_hold(dsl_dataset_promote_arg_t *ddpa, dsl_pool_t *dp, const void *tag); static void promote_rele(dsl_dataset_promote_arg_t *ddpa, const void *tag); int dsl_dataset_promote_check(void *arg, dmu_tx_t *tx) { dsl_dataset_promote_arg_t *ddpa = arg; dsl_pool_t *dp = dmu_tx_pool(tx); dsl_dataset_t *hds; struct promotenode *snap; int err; uint64_t unused; uint64_t ss_mv_cnt; size_t max_snap_len; boolean_t conflicting_snaps; err = promote_hold(ddpa, dp, FTAG); if (err != 0) return (err); hds = ddpa->ddpa_clone; max_snap_len = MAXNAMELEN - strlen(ddpa->ddpa_clonename) - 1; if (dsl_dataset_phys(hds)->ds_flags & DS_FLAG_NOPROMOTE) { promote_rele(ddpa, FTAG); return (SET_ERROR(EXDEV)); } snap = list_head(&ddpa->shared_snaps); if (snap == NULL) { err = SET_ERROR(ENOENT); goto out; } dsl_dataset_t *const origin_ds = snap->ds; /* * Encrypted clones share a DSL Crypto Key with their origin's dsl dir. * When doing a promote we must make sure the encryption root for * both the target and the target's origin does not change to avoid * needing to rewrap encryption keys */ err = dsl_dataset_promote_crypt_check(hds->ds_dir, origin_ds->ds_dir); if (err != 0) goto out; /* * Compute and check the amount of space to transfer. Since this is * so expensive, don't do the preliminary check. */ if (!dmu_tx_is_syncing(tx)) { promote_rele(ddpa, FTAG); return (0); } /* compute origin's new unique space */ snap = list_tail(&ddpa->clone_snaps); ASSERT(snap != NULL); ASSERT3U(dsl_dataset_phys(snap->ds)->ds_prev_snap_obj, ==, origin_ds->ds_object); dsl_deadlist_space_range(&snap->ds->ds_deadlist, dsl_dataset_phys(origin_ds)->ds_prev_snap_txg, UINT64_MAX, &ddpa->unique, &unused, &unused); /* * Walk the snapshots that we are moving * * Compute space to transfer. Consider the incremental changes * to used by each snapshot: * (my used) = (prev's used) + (blocks born) - (blocks killed) * So each snapshot gave birth to: * (blocks born) = (my used) - (prev's used) + (blocks killed) * So a sequence would look like: * (uN - u(N-1) + kN) + ... + (u1 - u0 + k1) + (u0 - 0 + k0) * Which simplifies to: * uN + kN + kN-1 + ... + k1 + k0 * Note however, if we stop before we reach the ORIGIN we get: * uN + kN + kN-1 + ... + kM - uM-1 */ conflicting_snaps = B_FALSE; ss_mv_cnt = 0; ddpa->used = dsl_dataset_phys(origin_ds)->ds_referenced_bytes; ddpa->comp = dsl_dataset_phys(origin_ds)->ds_compressed_bytes; ddpa->uncomp = dsl_dataset_phys(origin_ds)->ds_uncompressed_bytes; for (snap = list_head(&ddpa->shared_snaps); snap; snap = list_next(&ddpa->shared_snaps, snap)) { uint64_t val, dlused, dlcomp, dluncomp; dsl_dataset_t *ds = snap->ds; ss_mv_cnt++; /* * If there are long holds, we won't be able to evict * the objset. */ if (dsl_dataset_long_held(ds)) { err = SET_ERROR(EBUSY); goto out; } /* Check that the snapshot name does not conflict */ VERIFY0(dsl_dataset_get_snapname(ds)); if (strlen(ds->ds_snapname) >= max_snap_len) { err = SET_ERROR(ENAMETOOLONG); goto out; } err = dsl_dataset_snap_lookup(hds, ds->ds_snapname, &val); if (err == 0) { fnvlist_add_boolean(ddpa->err_ds, snap->ds->ds_snapname); conflicting_snaps = B_TRUE; } else if (err != ENOENT) { goto out; } /* The very first snapshot does not have a deadlist */ if (dsl_dataset_phys(ds)->ds_prev_snap_obj == 0) continue; dsl_deadlist_space(&ds->ds_deadlist, &dlused, &dlcomp, &dluncomp); ddpa->used += dlused; ddpa->comp += dlcomp; ddpa->uncomp += dluncomp; } /* * Check that bookmarks that are being transferred don't have * name conflicts. */ for (dsl_bookmark_node_t *dbn = avl_first(&origin_ds->ds_bookmarks); dbn != NULL && dbn->dbn_phys.zbm_creation_txg <= dsl_dataset_phys(origin_ds)->ds_creation_txg; dbn = AVL_NEXT(&origin_ds->ds_bookmarks, dbn)) { if (strlen(dbn->dbn_name) >= max_snap_len) { err = SET_ERROR(ENAMETOOLONG); goto out; } zfs_bookmark_phys_t bm; err = dsl_bookmark_lookup_impl(ddpa->ddpa_clone, dbn->dbn_name, &bm); if (err == 0) { fnvlist_add_boolean(ddpa->err_ds, dbn->dbn_name); conflicting_snaps = B_TRUE; } else if (err == ESRCH) { err = 0; - } else if (err != 0) { + } + if (err != 0) { goto out; } } /* * In order to return the full list of conflicting snapshots, we check * whether there was a conflict after traversing all of them. */ if (conflicting_snaps) { err = SET_ERROR(EEXIST); goto out; } /* * If we are a clone of a clone then we never reached ORIGIN, * so we need to subtract out the clone origin's used space. */ if (ddpa->origin_origin) { ddpa->used -= dsl_dataset_phys(ddpa->origin_origin)->ds_referenced_bytes; ddpa->comp -= dsl_dataset_phys(ddpa->origin_origin)->ds_compressed_bytes; ddpa->uncomp -= dsl_dataset_phys(ddpa->origin_origin)-> ds_uncompressed_bytes; } /* Check that there is enough space and limit headroom here */ err = dsl_dir_transfer_possible(origin_ds->ds_dir, hds->ds_dir, 0, ss_mv_cnt, ddpa->used, ddpa->cr, ddpa->proc); if (err != 0) goto out; /* * Compute the amounts of space that will be used by snapshots * after the promotion (for both origin and clone). For each, * it is the amount of space that will be on all of their * deadlists (that was not born before their new origin). */ if (dsl_dir_phys(hds->ds_dir)->dd_flags & DD_FLAG_USED_BREAKDOWN) { uint64_t space; /* * Note, typically this will not be a clone of a clone, * so dd_origin_txg will be < TXG_INITIAL, so * these snaplist_space() -> dsl_deadlist_space_range() * calls will be fast because they do not have to * iterate over all bps. */ snap = list_head(&ddpa->origin_snaps); if (snap == NULL) { err = SET_ERROR(ENOENT); goto out; } err = snaplist_space(&ddpa->shared_snaps, snap->ds->ds_dir->dd_origin_txg, &ddpa->cloneusedsnap); if (err != 0) goto out; err = snaplist_space(&ddpa->clone_snaps, snap->ds->ds_dir->dd_origin_txg, &space); if (err != 0) goto out; ddpa->cloneusedsnap += space; } if (dsl_dir_phys(origin_ds->ds_dir)->dd_flags & DD_FLAG_USED_BREAKDOWN) { err = snaplist_space(&ddpa->origin_snaps, dsl_dataset_phys(origin_ds)->ds_creation_txg, &ddpa->originusedsnap); if (err != 0) goto out; } out: promote_rele(ddpa, FTAG); return (err); } void dsl_dataset_promote_sync(void *arg, dmu_tx_t *tx) { dsl_dataset_promote_arg_t *ddpa = arg; dsl_pool_t *dp = dmu_tx_pool(tx); dsl_dataset_t *hds; struct promotenode *snap; dsl_dataset_t *origin_ds; dsl_dataset_t *origin_head; dsl_dir_t *dd; dsl_dir_t *odd = NULL; uint64_t oldnext_obj; int64_t delta; ASSERT(nvlist_empty(ddpa->err_ds)); VERIFY0(promote_hold(ddpa, dp, FTAG)); hds = ddpa->ddpa_clone; ASSERT0(dsl_dataset_phys(hds)->ds_flags & DS_FLAG_NOPROMOTE); snap = list_head(&ddpa->shared_snaps); origin_ds = snap->ds; dd = hds->ds_dir; snap = list_head(&ddpa->origin_snaps); origin_head = snap->ds; /* * We need to explicitly open odd, since origin_ds's dd will be * changing. */ VERIFY0(dsl_dir_hold_obj(dp, origin_ds->ds_dir->dd_object, NULL, FTAG, &odd)); dsl_dataset_promote_crypt_sync(hds->ds_dir, odd, tx); /* change origin's next snap */ dmu_buf_will_dirty(origin_ds->ds_dbuf, tx); oldnext_obj = dsl_dataset_phys(origin_ds)->ds_next_snap_obj; snap = list_tail(&ddpa->clone_snaps); ASSERT3U(dsl_dataset_phys(snap->ds)->ds_prev_snap_obj, ==, origin_ds->ds_object); dsl_dataset_phys(origin_ds)->ds_next_snap_obj = snap->ds->ds_object; /* change the origin's next clone */ if (dsl_dataset_phys(origin_ds)->ds_next_clones_obj) { dsl_dataset_remove_from_next_clones(origin_ds, snap->ds->ds_object, tx); VERIFY0(zap_add_int(dp->dp_meta_objset, dsl_dataset_phys(origin_ds)->ds_next_clones_obj, oldnext_obj, tx)); } /* change origin */ dmu_buf_will_dirty(dd->dd_dbuf, tx); ASSERT3U(dsl_dir_phys(dd)->dd_origin_obj, ==, origin_ds->ds_object); dsl_dir_phys(dd)->dd_origin_obj = dsl_dir_phys(odd)->dd_origin_obj; dd->dd_origin_txg = origin_head->ds_dir->dd_origin_txg; dmu_buf_will_dirty(odd->dd_dbuf, tx); dsl_dir_phys(odd)->dd_origin_obj = origin_ds->ds_object; origin_head->ds_dir->dd_origin_txg = dsl_dataset_phys(origin_ds)->ds_creation_txg; /* change dd_clone entries */ if (spa_version(dp->dp_spa) >= SPA_VERSION_DIR_CLONES) { VERIFY0(zap_remove_int(dp->dp_meta_objset, dsl_dir_phys(odd)->dd_clones, hds->ds_object, tx)); VERIFY0(zap_add_int(dp->dp_meta_objset, dsl_dir_phys(ddpa->origin_origin->ds_dir)->dd_clones, hds->ds_object, tx)); VERIFY0(zap_remove_int(dp->dp_meta_objset, dsl_dir_phys(ddpa->origin_origin->ds_dir)->dd_clones, origin_head->ds_object, tx)); if (dsl_dir_phys(dd)->dd_clones == 0) { dsl_dir_phys(dd)->dd_clones = zap_create(dp->dp_meta_objset, DMU_OT_DSL_CLONES, DMU_OT_NONE, 0, tx); } VERIFY0(zap_add_int(dp->dp_meta_objset, dsl_dir_phys(dd)->dd_clones, origin_head->ds_object, tx)); } /* * Move bookmarks to this dir. */ dsl_bookmark_node_t *dbn_next; for (dsl_bookmark_node_t *dbn = avl_first(&origin_head->ds_bookmarks); dbn != NULL && dbn->dbn_phys.zbm_creation_txg <= dsl_dataset_phys(origin_ds)->ds_creation_txg; dbn = dbn_next) { dbn_next = AVL_NEXT(&origin_head->ds_bookmarks, dbn); avl_remove(&origin_head->ds_bookmarks, dbn); VERIFY0(zap_remove(dp->dp_meta_objset, origin_head->ds_bookmarks_obj, dbn->dbn_name, tx)); dsl_bookmark_node_add(hds, dbn, tx); } dsl_bookmark_next_changed(hds, origin_ds, tx); /* move snapshots to this dir */ for (snap = list_head(&ddpa->shared_snaps); snap; snap = list_next(&ddpa->shared_snaps, snap)) { dsl_dataset_t *ds = snap->ds; /* * Property callbacks are registered to a particular * dsl_dir. Since ours is changing, evict the objset * so that they will be unregistered from the old dsl_dir. */ if (ds->ds_objset) { dmu_objset_evict(ds->ds_objset); ds->ds_objset = NULL; } /* move snap name entry */ VERIFY0(dsl_dataset_get_snapname(ds)); VERIFY0(dsl_dataset_snap_remove(origin_head, ds->ds_snapname, tx, B_TRUE)); VERIFY0(zap_add(dp->dp_meta_objset, dsl_dataset_phys(hds)->ds_snapnames_zapobj, ds->ds_snapname, 8, 1, &ds->ds_object, tx)); dsl_fs_ss_count_adjust(hds->ds_dir, 1, DD_FIELD_SNAPSHOT_COUNT, tx); /* change containing dsl_dir */ dmu_buf_will_dirty(ds->ds_dbuf, tx); ASSERT3U(dsl_dataset_phys(ds)->ds_dir_obj, ==, odd->dd_object); dsl_dataset_phys(ds)->ds_dir_obj = dd->dd_object; ASSERT3P(ds->ds_dir, ==, odd); dsl_dir_rele(ds->ds_dir, ds); VERIFY0(dsl_dir_hold_obj(dp, dd->dd_object, NULL, ds, &ds->ds_dir)); /* move any clone references */ if (dsl_dataset_phys(ds)->ds_next_clones_obj && spa_version(dp->dp_spa) >= SPA_VERSION_DIR_CLONES) { zap_cursor_t zc; zap_attribute_t za; for (zap_cursor_init(&zc, dp->dp_meta_objset, dsl_dataset_phys(ds)->ds_next_clones_obj); zap_cursor_retrieve(&zc, &za) == 0; zap_cursor_advance(&zc)) { dsl_dataset_t *cnds; uint64_t o; if (za.za_first_integer == oldnext_obj) { /* * We've already moved the * origin's reference. */ continue; } VERIFY0(dsl_dataset_hold_obj(dp, za.za_first_integer, FTAG, &cnds)); o = dsl_dir_phys(cnds->ds_dir)-> dd_head_dataset_obj; VERIFY0(zap_remove_int(dp->dp_meta_objset, dsl_dir_phys(odd)->dd_clones, o, tx)); VERIFY0(zap_add_int(dp->dp_meta_objset, dsl_dir_phys(dd)->dd_clones, o, tx)); dsl_dataset_rele(cnds, FTAG); } zap_cursor_fini(&zc); } ASSERT(!dsl_prop_hascb(ds)); } /* * Change space accounting. * Note, pa->*usedsnap and dd_used_breakdown[SNAP] will either * both be valid, or both be 0 (resulting in delta == 0). This * is true for each of {clone,origin} independently. */ delta = ddpa->cloneusedsnap - dsl_dir_phys(dd)->dd_used_breakdown[DD_USED_SNAP]; ASSERT3S(delta, >=, 0); ASSERT3U(ddpa->used, >=, delta); dsl_dir_diduse_space(dd, DD_USED_SNAP, delta, 0, 0, tx); dsl_dir_diduse_space(dd, DD_USED_HEAD, ddpa->used - delta, ddpa->comp, ddpa->uncomp, tx); delta = ddpa->originusedsnap - dsl_dir_phys(odd)->dd_used_breakdown[DD_USED_SNAP]; ASSERT3S(delta, <=, 0); ASSERT3U(ddpa->used, >=, -delta); dsl_dir_diduse_space(odd, DD_USED_SNAP, delta, 0, 0, tx); dsl_dir_diduse_space(odd, DD_USED_HEAD, -ddpa->used - delta, -ddpa->comp, -ddpa->uncomp, tx); dsl_dataset_phys(origin_ds)->ds_unique_bytes = ddpa->unique; /* * Since livelists are specific to a clone's origin txg, they * are no longer accurate. Destroy the livelist from the clone being * promoted. If the origin dataset is a clone, destroy its livelist * as well. */ dsl_dir_remove_livelist(dd, tx, B_TRUE); dsl_dir_remove_livelist(odd, tx, B_TRUE); /* log history record */ spa_history_log_internal_ds(hds, "promote", tx, " "); dsl_dir_rele(odd, FTAG); promote_rele(ddpa, FTAG); /* * Transfer common error blocks from old head to new head. */ if (spa_feature_is_enabled(dp->dp_spa, SPA_FEATURE_HEAD_ERRLOG)) { uint64_t old_head = origin_head->ds_object; uint64_t new_head = hds->ds_object; spa_swap_errlog(dp->dp_spa, new_head, old_head, tx); } } /* * Make a list of dsl_dataset_t's for the snapshots between first_obj * (exclusive) and last_obj (inclusive). The list will be in reverse * order (last_obj will be the list_head()). If first_obj == 0, do all * snapshots back to this dataset's origin. */ static int snaplist_make(dsl_pool_t *dp, uint64_t first_obj, uint64_t last_obj, list_t *l, const void *tag) { uint64_t obj = last_obj; list_create(l, sizeof (struct promotenode), offsetof(struct promotenode, link)); while (obj != first_obj) { dsl_dataset_t *ds; struct promotenode *snap; int err; err = dsl_dataset_hold_obj(dp, obj, tag, &ds); ASSERT(err != ENOENT); if (err != 0) return (err); if (first_obj == 0) first_obj = dsl_dir_phys(ds->ds_dir)->dd_origin_obj; snap = kmem_alloc(sizeof (*snap), KM_SLEEP); snap->ds = ds; list_insert_tail(l, snap); obj = dsl_dataset_phys(ds)->ds_prev_snap_obj; } return (0); } static int snaplist_space(list_t *l, uint64_t mintxg, uint64_t *spacep) { struct promotenode *snap; *spacep = 0; for (snap = list_head(l); snap; snap = list_next(l, snap)) { uint64_t used, comp, uncomp; dsl_deadlist_space_range(&snap->ds->ds_deadlist, mintxg, UINT64_MAX, &used, &comp, &uncomp); *spacep += used; } return (0); } static void snaplist_destroy(list_t *l, const void *tag) { struct promotenode *snap; if (l == NULL || !list_link_active(&l->list_head)) return; while ((snap = list_tail(l)) != NULL) { list_remove(l, snap); dsl_dataset_rele(snap->ds, tag); kmem_free(snap, sizeof (*snap)); } list_destroy(l); } static int promote_hold(dsl_dataset_promote_arg_t *ddpa, dsl_pool_t *dp, const void *tag) { int error; dsl_dir_t *dd; struct promotenode *snap; error = dsl_dataset_hold(dp, ddpa->ddpa_clonename, tag, &ddpa->ddpa_clone); if (error != 0) return (error); dd = ddpa->ddpa_clone->ds_dir; if (ddpa->ddpa_clone->ds_is_snapshot || !dsl_dir_is_clone(dd)) { dsl_dataset_rele(ddpa->ddpa_clone, tag); return (SET_ERROR(EINVAL)); } error = snaplist_make(dp, 0, dsl_dir_phys(dd)->dd_origin_obj, &ddpa->shared_snaps, tag); if (error != 0) goto out; error = snaplist_make(dp, 0, ddpa->ddpa_clone->ds_object, &ddpa->clone_snaps, tag); if (error != 0) goto out; snap = list_head(&ddpa->shared_snaps); ASSERT3U(snap->ds->ds_object, ==, dsl_dir_phys(dd)->dd_origin_obj); error = snaplist_make(dp, dsl_dir_phys(dd)->dd_origin_obj, dsl_dir_phys(snap->ds->ds_dir)->dd_head_dataset_obj, &ddpa->origin_snaps, tag); if (error != 0) goto out; if (dsl_dir_phys(snap->ds->ds_dir)->dd_origin_obj != 0) { error = dsl_dataset_hold_obj(dp, dsl_dir_phys(snap->ds->ds_dir)->dd_origin_obj, tag, &ddpa->origin_origin); if (error != 0) goto out; } out: if (error != 0) promote_rele(ddpa, tag); return (error); } static void promote_rele(dsl_dataset_promote_arg_t *ddpa, const void *tag) { snaplist_destroy(&ddpa->shared_snaps, tag); snaplist_destroy(&ddpa->clone_snaps, tag); snaplist_destroy(&ddpa->origin_snaps, tag); if (ddpa->origin_origin != NULL) dsl_dataset_rele(ddpa->origin_origin, tag); dsl_dataset_rele(ddpa->ddpa_clone, tag); } /* * Promote a clone. * * If it fails due to a conflicting snapshot name, "conflsnap" will be filled * in with the name. (It must be at least ZFS_MAX_DATASET_NAME_LEN bytes long.) */ int dsl_dataset_promote(const char *name, char *conflsnap) { dsl_dataset_promote_arg_t ddpa = { 0 }; uint64_t numsnaps; int error; nvpair_t *snap_pair; objset_t *os; /* * We will modify space proportional to the number of * snapshots. Compute numsnaps. */ error = dmu_objset_hold(name, FTAG, &os); if (error != 0) return (error); error = zap_count(dmu_objset_pool(os)->dp_meta_objset, dsl_dataset_phys(dmu_objset_ds(os))->ds_snapnames_zapobj, &numsnaps); dmu_objset_rele(os, FTAG); if (error != 0) return (error); ddpa.ddpa_clonename = name; ddpa.err_ds = fnvlist_alloc(); ddpa.cr = CRED(); ddpa.proc = curproc; error = dsl_sync_task(name, dsl_dataset_promote_check, dsl_dataset_promote_sync, &ddpa, 2 + numsnaps, ZFS_SPACE_CHECK_RESERVED); /* * Return the first conflicting snapshot found. */ snap_pair = nvlist_next_nvpair(ddpa.err_ds, NULL); if (snap_pair != NULL && conflsnap != NULL) (void) strlcpy(conflsnap, nvpair_name(snap_pair), ZFS_MAX_DATASET_NAME_LEN); fnvlist_free(ddpa.err_ds); return (error); } int dsl_dataset_clone_swap_check_impl(dsl_dataset_t *clone, dsl_dataset_t *origin_head, boolean_t force, void *owner, dmu_tx_t *tx) { /* * "slack" factor for received datasets with refquota set on them. * See the bottom of this function for details on its use. */ uint64_t refquota_slack = (uint64_t)DMU_MAX_ACCESS * spa_asize_inflation; int64_t unused_refres_delta; /* they should both be heads */ if (clone->ds_is_snapshot || origin_head->ds_is_snapshot) return (SET_ERROR(EINVAL)); /* if we are not forcing, the branch point should be just before them */ if (!force && clone->ds_prev != origin_head->ds_prev) return (SET_ERROR(EINVAL)); /* clone should be the clone (unless they are unrelated) */ if (clone->ds_prev != NULL && clone->ds_prev != clone->ds_dir->dd_pool->dp_origin_snap && origin_head->ds_dir != clone->ds_prev->ds_dir) return (SET_ERROR(EINVAL)); /* the clone should be a child of the origin */ if (clone->ds_dir->dd_parent != origin_head->ds_dir) return (SET_ERROR(EINVAL)); /* origin_head shouldn't be modified unless 'force' */ if (!force && dsl_dataset_modified_since_snap(origin_head, origin_head->ds_prev)) return (SET_ERROR(ETXTBSY)); /* origin_head should have no long holds (e.g. is not mounted) */ if (dsl_dataset_handoff_check(origin_head, owner, tx)) return (SET_ERROR(EBUSY)); /* check amount of any unconsumed refreservation */ unused_refres_delta = (int64_t)MIN(origin_head->ds_reserved, dsl_dataset_phys(origin_head)->ds_unique_bytes) - (int64_t)MIN(origin_head->ds_reserved, dsl_dataset_phys(clone)->ds_unique_bytes); if (unused_refres_delta > 0 && unused_refres_delta > dsl_dir_space_available(origin_head->ds_dir, NULL, 0, TRUE)) return (SET_ERROR(ENOSPC)); /* * The clone can't be too much over the head's refquota. * * To ensure that the entire refquota can be used, we allow one * transaction to exceed the refquota. Therefore, this check * needs to also allow for the space referenced to be more than the * refquota. The maximum amount of space that one transaction can use * on disk is DMU_MAX_ACCESS * spa_asize_inflation. Allowing this * overage ensures that we are able to receive a filesystem that * exceeds the refquota on the source system. * * So that overage is the refquota_slack we use below. */ if (origin_head->ds_quota != 0 && dsl_dataset_phys(clone)->ds_referenced_bytes > origin_head->ds_quota + refquota_slack) return (SET_ERROR(EDQUOT)); return (0); } static void dsl_dataset_swap_remap_deadlists(dsl_dataset_t *clone, dsl_dataset_t *origin, dmu_tx_t *tx) { uint64_t clone_remap_dl_obj, origin_remap_dl_obj; dsl_pool_t *dp = dmu_tx_pool(tx); ASSERT(dsl_pool_sync_context(dp)); clone_remap_dl_obj = dsl_dataset_get_remap_deadlist_object(clone); origin_remap_dl_obj = dsl_dataset_get_remap_deadlist_object(origin); if (clone_remap_dl_obj != 0) { dsl_deadlist_close(&clone->ds_remap_deadlist); dsl_dataset_unset_remap_deadlist_object(clone, tx); } if (origin_remap_dl_obj != 0) { dsl_deadlist_close(&origin->ds_remap_deadlist); dsl_dataset_unset_remap_deadlist_object(origin, tx); } if (clone_remap_dl_obj != 0) { dsl_dataset_set_remap_deadlist_object(origin, clone_remap_dl_obj, tx); dsl_deadlist_open(&origin->ds_remap_deadlist, dp->dp_meta_objset, clone_remap_dl_obj); } if (origin_remap_dl_obj != 0) { dsl_dataset_set_remap_deadlist_object(clone, origin_remap_dl_obj, tx); dsl_deadlist_open(&clone->ds_remap_deadlist, dp->dp_meta_objset, origin_remap_dl_obj); } } void dsl_dataset_clone_swap_sync_impl(dsl_dataset_t *clone, dsl_dataset_t *origin_head, dmu_tx_t *tx) { dsl_pool_t *dp = dmu_tx_pool(tx); int64_t unused_refres_delta; ASSERT(clone->ds_reserved == 0); /* * NOTE: On DEBUG kernels there could be a race between this and * the check function if spa_asize_inflation is adjusted... */ ASSERT(origin_head->ds_quota == 0 || dsl_dataset_phys(clone)->ds_unique_bytes <= origin_head->ds_quota + DMU_MAX_ACCESS * spa_asize_inflation); ASSERT3P(clone->ds_prev, ==, origin_head->ds_prev); dsl_dir_cancel_waiters(origin_head->ds_dir); /* * Swap per-dataset feature flags. */ for (spa_feature_t f = 0; f < SPA_FEATURES; f++) { if (!(spa_feature_table[f].fi_flags & ZFEATURE_FLAG_PER_DATASET)) { ASSERT(!dsl_dataset_feature_is_active(clone, f)); ASSERT(!dsl_dataset_feature_is_active(origin_head, f)); continue; } boolean_t clone_inuse = dsl_dataset_feature_is_active(clone, f); void *clone_feature = clone->ds_feature[f]; boolean_t origin_head_inuse = dsl_dataset_feature_is_active(origin_head, f); void *origin_head_feature = origin_head->ds_feature[f]; if (clone_inuse) dsl_dataset_deactivate_feature_impl(clone, f, tx); if (origin_head_inuse) dsl_dataset_deactivate_feature_impl(origin_head, f, tx); if (clone_inuse) { dsl_dataset_activate_feature(origin_head->ds_object, f, clone_feature, tx); origin_head->ds_feature[f] = clone_feature; } if (origin_head_inuse) { dsl_dataset_activate_feature(clone->ds_object, f, origin_head_feature, tx); clone->ds_feature[f] = origin_head_feature; } } dmu_buf_will_dirty(clone->ds_dbuf, tx); dmu_buf_will_dirty(origin_head->ds_dbuf, tx); if (clone->ds_objset != NULL) { dmu_objset_evict(clone->ds_objset); clone->ds_objset = NULL; } if (origin_head->ds_objset != NULL) { dmu_objset_evict(origin_head->ds_objset); origin_head->ds_objset = NULL; } unused_refres_delta = (int64_t)MIN(origin_head->ds_reserved, dsl_dataset_phys(origin_head)->ds_unique_bytes) - (int64_t)MIN(origin_head->ds_reserved, dsl_dataset_phys(clone)->ds_unique_bytes); /* * Reset origin's unique bytes. */ { dsl_dataset_t *origin = clone->ds_prev; uint64_t comp, uncomp; dmu_buf_will_dirty(origin->ds_dbuf, tx); dsl_deadlist_space_range(&clone->ds_deadlist, dsl_dataset_phys(origin)->ds_prev_snap_txg, UINT64_MAX, &dsl_dataset_phys(origin)->ds_unique_bytes, &comp, &uncomp); } /* swap blkptrs */ { rrw_enter(&clone->ds_bp_rwlock, RW_WRITER, FTAG); rrw_enter(&origin_head->ds_bp_rwlock, RW_WRITER, FTAG); blkptr_t tmp; tmp = dsl_dataset_phys(origin_head)->ds_bp; dsl_dataset_phys(origin_head)->ds_bp = dsl_dataset_phys(clone)->ds_bp; dsl_dataset_phys(clone)->ds_bp = tmp; rrw_exit(&origin_head->ds_bp_rwlock, FTAG); rrw_exit(&clone->ds_bp_rwlock, FTAG); } /* set dd_*_bytes */ { int64_t dused, dcomp, duncomp; uint64_t cdl_used, cdl_comp, cdl_uncomp; uint64_t odl_used, odl_comp, odl_uncomp; ASSERT3U(dsl_dir_phys(clone->ds_dir)-> dd_used_breakdown[DD_USED_SNAP], ==, 0); dsl_deadlist_space(&clone->ds_deadlist, &cdl_used, &cdl_comp, &cdl_uncomp); dsl_deadlist_space(&origin_head->ds_deadlist, &odl_used, &odl_comp, &odl_uncomp); dused = dsl_dataset_phys(clone)->ds_referenced_bytes + cdl_used - (dsl_dataset_phys(origin_head)->ds_referenced_bytes + odl_used); dcomp = dsl_dataset_phys(clone)->ds_compressed_bytes + cdl_comp - (dsl_dataset_phys(origin_head)->ds_compressed_bytes + odl_comp); duncomp = dsl_dataset_phys(clone)->ds_uncompressed_bytes + cdl_uncomp - (dsl_dataset_phys(origin_head)->ds_uncompressed_bytes + odl_uncomp); dsl_dir_diduse_space(origin_head->ds_dir, DD_USED_HEAD, dused, dcomp, duncomp, tx); dsl_dir_diduse_space(clone->ds_dir, DD_USED_HEAD, -dused, -dcomp, -duncomp, tx); /* * The difference in the space used by snapshots is the * difference in snapshot space due to the head's * deadlist (since that's the only thing that's * changing that affects the snapused). */ dsl_deadlist_space_range(&clone->ds_deadlist, origin_head->ds_dir->dd_origin_txg, UINT64_MAX, &cdl_used, &cdl_comp, &cdl_uncomp); dsl_deadlist_space_range(&origin_head->ds_deadlist, origin_head->ds_dir->dd_origin_txg, UINT64_MAX, &odl_used, &odl_comp, &odl_uncomp); dsl_dir_transfer_space(origin_head->ds_dir, cdl_used - odl_used, DD_USED_HEAD, DD_USED_SNAP, tx); } /* swap ds_*_bytes */ SWITCH64(dsl_dataset_phys(origin_head)->ds_referenced_bytes, dsl_dataset_phys(clone)->ds_referenced_bytes); SWITCH64(dsl_dataset_phys(origin_head)->ds_compressed_bytes, dsl_dataset_phys(clone)->ds_compressed_bytes); SWITCH64(dsl_dataset_phys(origin_head)->ds_uncompressed_bytes, dsl_dataset_phys(clone)->ds_uncompressed_bytes); SWITCH64(dsl_dataset_phys(origin_head)->ds_unique_bytes, dsl_dataset_phys(clone)->ds_unique_bytes); /* apply any parent delta for change in unconsumed refreservation */ dsl_dir_diduse_space(origin_head->ds_dir, DD_USED_REFRSRV, unused_refres_delta, 0, 0, tx); /* * Swap deadlists. */ dsl_deadlist_close(&clone->ds_deadlist); dsl_deadlist_close(&origin_head->ds_deadlist); SWITCH64(dsl_dataset_phys(origin_head)->ds_deadlist_obj, dsl_dataset_phys(clone)->ds_deadlist_obj); dsl_deadlist_open(&clone->ds_deadlist, dp->dp_meta_objset, dsl_dataset_phys(clone)->ds_deadlist_obj); dsl_deadlist_open(&origin_head->ds_deadlist, dp->dp_meta_objset, dsl_dataset_phys(origin_head)->ds_deadlist_obj); dsl_dataset_swap_remap_deadlists(clone, origin_head, tx); /* * If there is a bookmark at the origin, its "next dataset" is * changing, so we need to reset its FBN. */ dsl_bookmark_next_changed(origin_head, origin_head->ds_prev, tx); dsl_scan_ds_clone_swapped(origin_head, clone, tx); /* * Destroy any livelists associated with the clone or the origin, * since after the swap the corresponding livelists are no longer * valid. */ dsl_dir_remove_livelist(clone->ds_dir, tx, B_TRUE); dsl_dir_remove_livelist(origin_head->ds_dir, tx, B_TRUE); spa_history_log_internal_ds(clone, "clone swap", tx, "parent=%s", origin_head->ds_dir->dd_myname); } /* * Given a pool name and a dataset object number in that pool, * return the name of that dataset. */ int dsl_dsobj_to_dsname(char *pname, uint64_t obj, char *buf) { dsl_pool_t *dp; dsl_dataset_t *ds; int error; error = dsl_pool_hold(pname, FTAG, &dp); if (error != 0) return (error); error = dsl_dataset_hold_obj(dp, obj, FTAG, &ds); if (error == 0) { dsl_dataset_name(ds, buf); dsl_dataset_rele(ds, FTAG); } dsl_pool_rele(dp, FTAG); return (error); } int dsl_dataset_check_quota(dsl_dataset_t *ds, boolean_t check_quota, uint64_t asize, uint64_t inflight, uint64_t *used, uint64_t *ref_rsrv) { int error = 0; ASSERT3S(asize, >, 0); /* * *ref_rsrv is the portion of asize that will come from any * unconsumed refreservation space. */ *ref_rsrv = 0; mutex_enter(&ds->ds_lock); /* * Make a space adjustment for reserved bytes. */ if (ds->ds_reserved > dsl_dataset_phys(ds)->ds_unique_bytes) { ASSERT3U(*used, >=, ds->ds_reserved - dsl_dataset_phys(ds)->ds_unique_bytes); *used -= (ds->ds_reserved - dsl_dataset_phys(ds)->ds_unique_bytes); *ref_rsrv = asize - MIN(asize, parent_delta(ds, asize + inflight)); } if (!check_quota || ds->ds_quota == 0) { mutex_exit(&ds->ds_lock); return (0); } /* * If they are requesting more space, and our current estimate * is over quota, they get to try again unless the actual * on-disk is over quota and there are no pending changes (which * may free up space for us). */ if (dsl_dataset_phys(ds)->ds_referenced_bytes + inflight >= ds->ds_quota) { if (inflight > 0 || dsl_dataset_phys(ds)->ds_referenced_bytes < ds->ds_quota) error = SET_ERROR(ERESTART); else error = SET_ERROR(EDQUOT); } mutex_exit(&ds->ds_lock); return (error); } typedef struct dsl_dataset_set_qr_arg { const char *ddsqra_name; zprop_source_t ddsqra_source; uint64_t ddsqra_value; } dsl_dataset_set_qr_arg_t; static int dsl_dataset_set_refquota_check(void *arg, dmu_tx_t *tx) { dsl_dataset_set_qr_arg_t *ddsqra = arg; dsl_pool_t *dp = dmu_tx_pool(tx); dsl_dataset_t *ds; int error; uint64_t newval; if (spa_version(dp->dp_spa) < SPA_VERSION_REFQUOTA) return (SET_ERROR(ENOTSUP)); error = dsl_dataset_hold(dp, ddsqra->ddsqra_name, FTAG, &ds); if (error != 0) return (error); if (ds->ds_is_snapshot) { dsl_dataset_rele(ds, FTAG); return (SET_ERROR(EINVAL)); } error = dsl_prop_predict(ds->ds_dir, zfs_prop_to_name(ZFS_PROP_REFQUOTA), ddsqra->ddsqra_source, ddsqra->ddsqra_value, &newval); if (error != 0) { dsl_dataset_rele(ds, FTAG); return (error); } if (newval == 0) { dsl_dataset_rele(ds, FTAG); return (0); } if (newval < dsl_dataset_phys(ds)->ds_referenced_bytes || newval < ds->ds_reserved) { dsl_dataset_rele(ds, FTAG); return (SET_ERROR(ENOSPC)); } dsl_dataset_rele(ds, FTAG); return (0); } static void dsl_dataset_set_refquota_sync(void *arg, dmu_tx_t *tx) { dsl_dataset_set_qr_arg_t *ddsqra = arg; dsl_pool_t *dp = dmu_tx_pool(tx); dsl_dataset_t *ds = NULL; uint64_t newval; VERIFY0(dsl_dataset_hold(dp, ddsqra->ddsqra_name, FTAG, &ds)); dsl_prop_set_sync_impl(ds, zfs_prop_to_name(ZFS_PROP_REFQUOTA), ddsqra->ddsqra_source, sizeof (ddsqra->ddsqra_value), 1, &ddsqra->ddsqra_value, tx); VERIFY0(dsl_prop_get_int_ds(ds, zfs_prop_to_name(ZFS_PROP_REFQUOTA), &newval)); if (ds->ds_quota != newval) { dmu_buf_will_dirty(ds->ds_dbuf, tx); ds->ds_quota = newval; } dsl_dataset_rele(ds, FTAG); } int dsl_dataset_set_refquota(const char *dsname, zprop_source_t source, uint64_t refquota) { dsl_dataset_set_qr_arg_t ddsqra; ddsqra.ddsqra_name = dsname; ddsqra.ddsqra_source = source; ddsqra.ddsqra_value = refquota; return (dsl_sync_task(dsname, dsl_dataset_set_refquota_check, dsl_dataset_set_refquota_sync, &ddsqra, 0, ZFS_SPACE_CHECK_EXTRA_RESERVED)); } static int dsl_dataset_set_refreservation_check(void *arg, dmu_tx_t *tx) { dsl_dataset_set_qr_arg_t *ddsqra = arg; dsl_pool_t *dp = dmu_tx_pool(tx); dsl_dataset_t *ds; int error; uint64_t newval, unique; if (spa_version(dp->dp_spa) < SPA_VERSION_REFRESERVATION) return (SET_ERROR(ENOTSUP)); error = dsl_dataset_hold(dp, ddsqra->ddsqra_name, FTAG, &ds); if (error != 0) return (error); if (ds->ds_is_snapshot) { dsl_dataset_rele(ds, FTAG); return (SET_ERROR(EINVAL)); } error = dsl_prop_predict(ds->ds_dir, zfs_prop_to_name(ZFS_PROP_REFRESERVATION), ddsqra->ddsqra_source, ddsqra->ddsqra_value, &newval); if (error != 0) { dsl_dataset_rele(ds, FTAG); return (error); } /* * If we are doing the preliminary check in open context, the * space estimates may be inaccurate. */ if (!dmu_tx_is_syncing(tx)) { dsl_dataset_rele(ds, FTAG); return (0); } mutex_enter(&ds->ds_lock); if (!DS_UNIQUE_IS_ACCURATE(ds)) dsl_dataset_recalc_head_uniq(ds); unique = dsl_dataset_phys(ds)->ds_unique_bytes; mutex_exit(&ds->ds_lock); if (MAX(unique, newval) > MAX(unique, ds->ds_reserved)) { uint64_t delta = MAX(unique, newval) - MAX(unique, ds->ds_reserved); if (delta > dsl_dir_space_available(ds->ds_dir, NULL, 0, B_TRUE) || (ds->ds_quota > 0 && newval > ds->ds_quota)) { dsl_dataset_rele(ds, FTAG); return (SET_ERROR(ENOSPC)); } } dsl_dataset_rele(ds, FTAG); return (0); } void dsl_dataset_set_refreservation_sync_impl(dsl_dataset_t *ds, zprop_source_t source, uint64_t value, dmu_tx_t *tx) { uint64_t newval; uint64_t unique; int64_t delta; dsl_prop_set_sync_impl(ds, zfs_prop_to_name(ZFS_PROP_REFRESERVATION), source, sizeof (value), 1, &value, tx); VERIFY0(dsl_prop_get_int_ds(ds, zfs_prop_to_name(ZFS_PROP_REFRESERVATION), &newval)); dmu_buf_will_dirty(ds->ds_dbuf, tx); mutex_enter(&ds->ds_dir->dd_lock); mutex_enter(&ds->ds_lock); ASSERT(DS_UNIQUE_IS_ACCURATE(ds)); unique = dsl_dataset_phys(ds)->ds_unique_bytes; delta = MAX(0, (int64_t)(newval - unique)) - MAX(0, (int64_t)(ds->ds_reserved - unique)); ds->ds_reserved = newval; mutex_exit(&ds->ds_lock); dsl_dir_diduse_space(ds->ds_dir, DD_USED_REFRSRV, delta, 0, 0, tx); mutex_exit(&ds->ds_dir->dd_lock); } static void dsl_dataset_set_refreservation_sync(void *arg, dmu_tx_t *tx) { dsl_dataset_set_qr_arg_t *ddsqra = arg; dsl_pool_t *dp = dmu_tx_pool(tx); dsl_dataset_t *ds = NULL; VERIFY0(dsl_dataset_hold(dp, ddsqra->ddsqra_name, FTAG, &ds)); dsl_dataset_set_refreservation_sync_impl(ds, ddsqra->ddsqra_source, ddsqra->ddsqra_value, tx); dsl_dataset_rele(ds, FTAG); } int dsl_dataset_set_refreservation(const char *dsname, zprop_source_t source, uint64_t refreservation) { dsl_dataset_set_qr_arg_t ddsqra; ddsqra.ddsqra_name = dsname; ddsqra.ddsqra_source = source; ddsqra.ddsqra_value = refreservation; return (dsl_sync_task(dsname, dsl_dataset_set_refreservation_check, dsl_dataset_set_refreservation_sync, &ddsqra, 0, ZFS_SPACE_CHECK_EXTRA_RESERVED)); } typedef struct dsl_dataset_set_compression_arg { const char *ddsca_name; zprop_source_t ddsca_source; uint64_t ddsca_value; } dsl_dataset_set_compression_arg_t; static int dsl_dataset_set_compression_check(void *arg, dmu_tx_t *tx) { dsl_dataset_set_compression_arg_t *ddsca = arg; dsl_pool_t *dp = dmu_tx_pool(tx); uint64_t compval = ZIO_COMPRESS_ALGO(ddsca->ddsca_value); spa_feature_t f = zio_compress_to_feature(compval); if (f == SPA_FEATURE_NONE) return (SET_ERROR(EINVAL)); if (!spa_feature_is_enabled(dp->dp_spa, f)) return (SET_ERROR(ENOTSUP)); return (0); } static void dsl_dataset_set_compression_sync(void *arg, dmu_tx_t *tx) { dsl_dataset_set_compression_arg_t *ddsca = arg; dsl_pool_t *dp = dmu_tx_pool(tx); dsl_dataset_t *ds = NULL; uint64_t compval = ZIO_COMPRESS_ALGO(ddsca->ddsca_value); spa_feature_t f = zio_compress_to_feature(compval); ASSERT3S(f, !=, SPA_FEATURE_NONE); ASSERT3S(spa_feature_table[f].fi_type, ==, ZFEATURE_TYPE_BOOLEAN); VERIFY0(dsl_dataset_hold(dp, ddsca->ddsca_name, FTAG, &ds)); if (zfeature_active(f, ds->ds_feature[f]) != B_TRUE) { ds->ds_feature_activation[f] = (void *)B_TRUE; dsl_dataset_activate_feature(ds->ds_object, f, ds->ds_feature_activation[f], tx); ds->ds_feature[f] = ds->ds_feature_activation[f]; } dsl_dataset_rele(ds, FTAG); } int dsl_dataset_set_compression(const char *dsname, zprop_source_t source, uint64_t compression) { dsl_dataset_set_compression_arg_t ddsca; /* * The sync task is only required for zstd in order to activate * the feature flag when the property is first set. */ if (ZIO_COMPRESS_ALGO(compression) != ZIO_COMPRESS_ZSTD) return (0); ddsca.ddsca_name = dsname; ddsca.ddsca_source = source; ddsca.ddsca_value = compression; return (dsl_sync_task(dsname, dsl_dataset_set_compression_check, dsl_dataset_set_compression_sync, &ddsca, 0, ZFS_SPACE_CHECK_EXTRA_RESERVED)); } /* * Return (in *usedp) the amount of space referenced by "new" that was not * referenced at the time the bookmark corresponds to. "New" may be a * snapshot or a head. The bookmark must be before new, in * new's filesystem (or its origin) -- caller verifies this. * * The written space is calculated by considering two components: First, we * ignore any freed space, and calculate the written as new's used space * minus old's used space. Next, we add in the amount of space that was freed * between the two time points, thus reducing new's used space relative to * old's. Specifically, this is the space that was born before * zbm_creation_txg, and freed before new (ie. on new's deadlist or a * previous deadlist). * * space freed [---------------------] * snapshots ---O-------O--------O-------O------ * bookmark new * * Note, the bookmark's zbm_*_bytes_refd must be valid, but if the HAS_FBN * flag is not set, we will calculate the freed_before_next based on the * next snapshot's deadlist, rather than using zbm_*_freed_before_next_snap. */ static int dsl_dataset_space_written_impl(zfs_bookmark_phys_t *bmp, dsl_dataset_t *new, uint64_t *usedp, uint64_t *compp, uint64_t *uncompp) { int err = 0; dsl_pool_t *dp = new->ds_dir->dd_pool; ASSERT(dsl_pool_config_held(dp)); if (dsl_dataset_is_snapshot(new)) { ASSERT3U(bmp->zbm_creation_txg, <, dsl_dataset_phys(new)->ds_creation_txg); } *usedp = 0; *usedp += dsl_dataset_phys(new)->ds_referenced_bytes; *usedp -= bmp->zbm_referenced_bytes_refd; *compp = 0; *compp += dsl_dataset_phys(new)->ds_compressed_bytes; *compp -= bmp->zbm_compressed_bytes_refd; *uncompp = 0; *uncompp += dsl_dataset_phys(new)->ds_uncompressed_bytes; *uncompp -= bmp->zbm_uncompressed_bytes_refd; dsl_dataset_t *snap = new; while (dsl_dataset_phys(snap)->ds_prev_snap_txg > bmp->zbm_creation_txg) { uint64_t used, comp, uncomp; dsl_deadlist_space_range(&snap->ds_deadlist, 0, bmp->zbm_creation_txg, &used, &comp, &uncomp); *usedp += used; *compp += comp; *uncompp += uncomp; uint64_t snapobj = dsl_dataset_phys(snap)->ds_prev_snap_obj; if (snap != new) dsl_dataset_rele(snap, FTAG); err = dsl_dataset_hold_obj(dp, snapobj, FTAG, &snap); if (err != 0) break; } /* * We might not have the FBN if we are calculating written from * a snapshot (because we didn't know the correct "next" snapshot * until now). */ if (bmp->zbm_flags & ZBM_FLAG_HAS_FBN) { *usedp += bmp->zbm_referenced_freed_before_next_snap; *compp += bmp->zbm_compressed_freed_before_next_snap; *uncompp += bmp->zbm_uncompressed_freed_before_next_snap; } else { ASSERT3U(dsl_dataset_phys(snap)->ds_prev_snap_txg, ==, bmp->zbm_creation_txg); uint64_t used, comp, uncomp; dsl_deadlist_space(&snap->ds_deadlist, &used, &comp, &uncomp); *usedp += used; *compp += comp; *uncompp += uncomp; } if (snap != new) dsl_dataset_rele(snap, FTAG); return (err); } /* * Return (in *usedp) the amount of space written in new that was not * present at the time the bookmark corresponds to. New may be a * snapshot or the head. Old must be a bookmark before new, in * new's filesystem (or its origin) -- caller verifies this. */ int dsl_dataset_space_written_bookmark(zfs_bookmark_phys_t *bmp, dsl_dataset_t *new, uint64_t *usedp, uint64_t *compp, uint64_t *uncompp) { if (!(bmp->zbm_flags & ZBM_FLAG_HAS_FBN)) return (SET_ERROR(ENOTSUP)); return (dsl_dataset_space_written_impl(bmp, new, usedp, compp, uncompp)); } /* * Return (in *usedp) the amount of space written in new that is not * present in oldsnap. New may be a snapshot or the head. Old must be * a snapshot before new, in new's filesystem (or its origin). If not then * fail and return EINVAL. */ int dsl_dataset_space_written(dsl_dataset_t *oldsnap, dsl_dataset_t *new, uint64_t *usedp, uint64_t *compp, uint64_t *uncompp) { if (!dsl_dataset_is_before(new, oldsnap, 0)) return (SET_ERROR(EINVAL)); zfs_bookmark_phys_t zbm = { 0 }; dsl_dataset_phys_t *dsp = dsl_dataset_phys(oldsnap); zbm.zbm_guid = dsp->ds_guid; zbm.zbm_creation_txg = dsp->ds_creation_txg; zbm.zbm_creation_time = dsp->ds_creation_time; zbm.zbm_referenced_bytes_refd = dsp->ds_referenced_bytes; zbm.zbm_compressed_bytes_refd = dsp->ds_compressed_bytes; zbm.zbm_uncompressed_bytes_refd = dsp->ds_uncompressed_bytes; /* * If oldsnap is the origin (or origin's origin, ...) of new, * we can't easily calculate the effective FBN. Therefore, * we do not set ZBM_FLAG_HAS_FBN, so that the _impl will calculate * it relative to the correct "next": the next snapshot towards "new", * rather than the next snapshot in oldsnap's dsl_dir. */ return (dsl_dataset_space_written_impl(&zbm, new, usedp, compp, uncompp)); } /* * Return (in *usedp) the amount of space that will be reclaimed if firstsnap, * lastsnap, and all snapshots in between are deleted. * * blocks that would be freed [---------------------------] * snapshots ---O-------O--------O-------O--------O * firstsnap lastsnap * * This is the set of blocks that were born after the snap before firstsnap, * (birth > firstsnap->prev_snap_txg) and died before the snap after the * last snap (ie, is on lastsnap->ds_next->ds_deadlist or an earlier deadlist). * We calculate this by iterating over the relevant deadlists (from the snap * after lastsnap, backward to the snap after firstsnap), summing up the * space on the deadlist that was born after the snap before firstsnap. */ int dsl_dataset_space_wouldfree(dsl_dataset_t *firstsnap, dsl_dataset_t *lastsnap, uint64_t *usedp, uint64_t *compp, uint64_t *uncompp) { int err = 0; uint64_t snapobj; dsl_pool_t *dp = firstsnap->ds_dir->dd_pool; ASSERT(firstsnap->ds_is_snapshot); ASSERT(lastsnap->ds_is_snapshot); /* * Check that the snapshots are in the same dsl_dir, and firstsnap * is before lastsnap. */ if (firstsnap->ds_dir != lastsnap->ds_dir || dsl_dataset_phys(firstsnap)->ds_creation_txg > dsl_dataset_phys(lastsnap)->ds_creation_txg) return (SET_ERROR(EINVAL)); *usedp = *compp = *uncompp = 0; snapobj = dsl_dataset_phys(lastsnap)->ds_next_snap_obj; while (snapobj != firstsnap->ds_object) { dsl_dataset_t *ds; uint64_t used, comp, uncomp; err = dsl_dataset_hold_obj(dp, snapobj, FTAG, &ds); if (err != 0) break; dsl_deadlist_space_range(&ds->ds_deadlist, dsl_dataset_phys(firstsnap)->ds_prev_snap_txg, UINT64_MAX, &used, &comp, &uncomp); *usedp += used; *compp += comp; *uncompp += uncomp; snapobj = dsl_dataset_phys(ds)->ds_prev_snap_obj; ASSERT3U(snapobj, !=, 0); dsl_dataset_rele(ds, FTAG); } return (err); } /* * Return TRUE if 'earlier' is an earlier snapshot in 'later's timeline. * For example, they could both be snapshots of the same filesystem, and * 'earlier' is before 'later'. Or 'earlier' could be the origin of * 'later's filesystem. Or 'earlier' could be an older snapshot in the origin's * filesystem. Or 'earlier' could be the origin's origin. * * If non-zero, earlier_txg is used instead of earlier's ds_creation_txg. */ boolean_t dsl_dataset_is_before(dsl_dataset_t *later, dsl_dataset_t *earlier, uint64_t earlier_txg) { dsl_pool_t *dp = later->ds_dir->dd_pool; int error; boolean_t ret; ASSERT(dsl_pool_config_held(dp)); ASSERT(earlier->ds_is_snapshot || earlier_txg != 0); if (earlier_txg == 0) earlier_txg = dsl_dataset_phys(earlier)->ds_creation_txg; if (later->ds_is_snapshot && earlier_txg >= dsl_dataset_phys(later)->ds_creation_txg) return (B_FALSE); if (later->ds_dir == earlier->ds_dir) return (B_TRUE); /* * We check dd_origin_obj explicitly here rather than using * dsl_dir_is_clone() so that we will return TRUE if "earlier" * is $ORIGIN@$ORIGIN. dsl_dataset_space_written() depends on * this behavior. */ if (dsl_dir_phys(later->ds_dir)->dd_origin_obj == 0) return (B_FALSE); dsl_dataset_t *origin; error = dsl_dataset_hold_obj(dp, dsl_dir_phys(later->ds_dir)->dd_origin_obj, FTAG, &origin); if (error != 0) return (B_FALSE); if (dsl_dataset_phys(origin)->ds_creation_txg == earlier_txg && origin->ds_dir == earlier->ds_dir) { dsl_dataset_rele(origin, FTAG); return (B_TRUE); } ret = dsl_dataset_is_before(origin, earlier, earlier_txg); dsl_dataset_rele(origin, FTAG); return (ret); } void dsl_dataset_zapify(dsl_dataset_t *ds, dmu_tx_t *tx) { objset_t *mos = ds->ds_dir->dd_pool->dp_meta_objset; dmu_object_zapify(mos, ds->ds_object, DMU_OT_DSL_DATASET, tx); } boolean_t dsl_dataset_is_zapified(dsl_dataset_t *ds) { dmu_object_info_t doi; dmu_object_info_from_db(ds->ds_dbuf, &doi); return (doi.doi_type == DMU_OTN_ZAP_METADATA); } boolean_t dsl_dataset_has_resume_receive_state(dsl_dataset_t *ds) { return (dsl_dataset_is_zapified(ds) && zap_contains(ds->ds_dir->dd_pool->dp_meta_objset, ds->ds_object, DS_FIELD_RESUME_TOGUID) == 0); } uint64_t dsl_dataset_get_remap_deadlist_object(dsl_dataset_t *ds) { uint64_t remap_deadlist_obj; int err; if (!dsl_dataset_is_zapified(ds)) return (0); err = zap_lookup(ds->ds_dir->dd_pool->dp_meta_objset, ds->ds_object, DS_FIELD_REMAP_DEADLIST, sizeof (remap_deadlist_obj), 1, &remap_deadlist_obj); if (err != 0) { VERIFY3S(err, ==, ENOENT); return (0); } ASSERT(remap_deadlist_obj != 0); return (remap_deadlist_obj); } boolean_t dsl_dataset_remap_deadlist_exists(dsl_dataset_t *ds) { EQUIV(dsl_deadlist_is_open(&ds->ds_remap_deadlist), dsl_dataset_get_remap_deadlist_object(ds) != 0); return (dsl_deadlist_is_open(&ds->ds_remap_deadlist)); } static void dsl_dataset_set_remap_deadlist_object(dsl_dataset_t *ds, uint64_t obj, dmu_tx_t *tx) { ASSERT(obj != 0); dsl_dataset_zapify(ds, tx); VERIFY0(zap_add(ds->ds_dir->dd_pool->dp_meta_objset, ds->ds_object, DS_FIELD_REMAP_DEADLIST, sizeof (obj), 1, &obj, tx)); } static void dsl_dataset_unset_remap_deadlist_object(dsl_dataset_t *ds, dmu_tx_t *tx) { VERIFY0(zap_remove(ds->ds_dir->dd_pool->dp_meta_objset, ds->ds_object, DS_FIELD_REMAP_DEADLIST, tx)); } void dsl_dataset_destroy_remap_deadlist(dsl_dataset_t *ds, dmu_tx_t *tx) { uint64_t remap_deadlist_object; spa_t *spa = ds->ds_dir->dd_pool->dp_spa; ASSERT(dmu_tx_is_syncing(tx)); ASSERT(dsl_dataset_remap_deadlist_exists(ds)); remap_deadlist_object = ds->ds_remap_deadlist.dl_object; dsl_deadlist_close(&ds->ds_remap_deadlist); dsl_deadlist_free(spa_meta_objset(spa), remap_deadlist_object, tx); dsl_dataset_unset_remap_deadlist_object(ds, tx); spa_feature_decr(spa, SPA_FEATURE_OBSOLETE_COUNTS, tx); } void dsl_dataset_create_remap_deadlist(dsl_dataset_t *ds, dmu_tx_t *tx) { uint64_t remap_deadlist_obj; spa_t *spa = ds->ds_dir->dd_pool->dp_spa; ASSERT(dmu_tx_is_syncing(tx)); ASSERT(MUTEX_HELD(&ds->ds_remap_deadlist_lock)); /* * Currently we only create remap deadlists when there are indirect * vdevs with referenced mappings. */ ASSERT(spa_feature_is_active(spa, SPA_FEATURE_DEVICE_REMOVAL)); remap_deadlist_obj = dsl_deadlist_clone( &ds->ds_deadlist, UINT64_MAX, dsl_dataset_phys(ds)->ds_prev_snap_obj, tx); dsl_dataset_set_remap_deadlist_object(ds, remap_deadlist_obj, tx); dsl_deadlist_open(&ds->ds_remap_deadlist, spa_meta_objset(spa), remap_deadlist_obj); spa_feature_incr(spa, SPA_FEATURE_OBSOLETE_COUNTS, tx); } void dsl_dataset_activate_redaction(dsl_dataset_t *ds, uint64_t *redact_snaps, uint64_t num_redact_snaps, dmu_tx_t *tx) { uint64_t dsobj = ds->ds_object; struct feature_type_uint64_array_arg *ftuaa = kmem_zalloc(sizeof (*ftuaa), KM_SLEEP); ftuaa->length = (int64_t)num_redact_snaps; if (num_redact_snaps > 0) { ftuaa->array = kmem_alloc(num_redact_snaps * sizeof (uint64_t), KM_SLEEP); memcpy(ftuaa->array, redact_snaps, num_redact_snaps * sizeof (uint64_t)); } dsl_dataset_activate_feature(dsobj, SPA_FEATURE_REDACTED_DATASETS, ftuaa, tx); ds->ds_feature[SPA_FEATURE_REDACTED_DATASETS] = ftuaa; } /* * Find and return (in *oldest_dsobj) the oldest snapshot of the dsobj * dataset whose birth time is >= min_txg. */ int dsl_dataset_oldest_snapshot(spa_t *spa, uint64_t head_ds, uint64_t min_txg, uint64_t *oldest_dsobj) { dsl_dataset_t *ds; dsl_pool_t *dp = spa->spa_dsl_pool; int error = dsl_dataset_hold_obj(dp, head_ds, FTAG, &ds); if (error != 0) return (error); uint64_t prev_obj = dsl_dataset_phys(ds)->ds_prev_snap_obj; uint64_t prev_obj_txg = dsl_dataset_phys(ds)->ds_prev_snap_txg; while (prev_obj != 0 && min_txg < prev_obj_txg) { dsl_dataset_rele(ds, FTAG); if ((error = dsl_dataset_hold_obj(dp, prev_obj, FTAG, &ds)) != 0) return (error); prev_obj_txg = dsl_dataset_phys(ds)->ds_prev_snap_txg; prev_obj = dsl_dataset_phys(ds)->ds_prev_snap_obj; } *oldest_dsobj = ds->ds_object; dsl_dataset_rele(ds, FTAG); return (0); } ZFS_MODULE_PARAM(zfs, zfs_, max_recordsize, UINT, ZMOD_RW, "Max allowed record size"); ZFS_MODULE_PARAM(zfs, zfs_, allow_redacted_dataset_mount, INT, ZMOD_RW, "Allow mounting of redacted datasets"); ZFS_MODULE_PARAM(zfs, zfs_, snapshot_history_enabled, INT, ZMOD_RW, "Include snapshot events in pool history/events"); EXPORT_SYMBOL(dsl_dataset_hold); EXPORT_SYMBOL(dsl_dataset_hold_flags); EXPORT_SYMBOL(dsl_dataset_hold_obj); EXPORT_SYMBOL(dsl_dataset_hold_obj_flags); EXPORT_SYMBOL(dsl_dataset_own); EXPORT_SYMBOL(dsl_dataset_own_obj); EXPORT_SYMBOL(dsl_dataset_name); EXPORT_SYMBOL(dsl_dataset_rele); EXPORT_SYMBOL(dsl_dataset_rele_flags); EXPORT_SYMBOL(dsl_dataset_disown); EXPORT_SYMBOL(dsl_dataset_tryown); EXPORT_SYMBOL(dsl_dataset_create_sync); EXPORT_SYMBOL(dsl_dataset_create_sync_dd); EXPORT_SYMBOL(dsl_dataset_snapshot_check); EXPORT_SYMBOL(dsl_dataset_snapshot_sync); EXPORT_SYMBOL(dsl_dataset_promote); EXPORT_SYMBOL(dsl_dataset_user_hold); EXPORT_SYMBOL(dsl_dataset_user_release); EXPORT_SYMBOL(dsl_dataset_get_holds); EXPORT_SYMBOL(dsl_dataset_get_blkptr); EXPORT_SYMBOL(dsl_dataset_get_spa); EXPORT_SYMBOL(dsl_dataset_modified_since_snap); EXPORT_SYMBOL(dsl_dataset_space_written); EXPORT_SYMBOL(dsl_dataset_space_wouldfree); EXPORT_SYMBOL(dsl_dataset_sync); EXPORT_SYMBOL(dsl_dataset_block_born); EXPORT_SYMBOL(dsl_dataset_block_kill); EXPORT_SYMBOL(dsl_dataset_dirty); EXPORT_SYMBOL(dsl_dataset_stats); EXPORT_SYMBOL(dsl_dataset_fast_stat); EXPORT_SYMBOL(dsl_dataset_space); EXPORT_SYMBOL(dsl_dataset_fsid_guid); EXPORT_SYMBOL(dsl_dsobj_to_dsname); EXPORT_SYMBOL(dsl_dataset_check_quota); EXPORT_SYMBOL(dsl_dataset_clone_swap_check_impl); EXPORT_SYMBOL(dsl_dataset_clone_swap_sync_impl); diff --git a/module/zfs/dsl_pool.c b/module/zfs/dsl_pool.c index 5d4311ff4557..5ca918a87ee1 100644 --- a/module/zfs/dsl_pool.c +++ b/module/zfs/dsl_pool.c @@ -1,1494 +1,1493 @@ /* * CDDL HEADER START * * The contents of this file are subject to the terms of the * Common Development and Distribution License (the "License"). * You may not use this file except in compliance with the License. * * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE * or https://opensource.org/licenses/CDDL-1.0. * See the License for the specific language governing permissions * and limitations under the License. * * When distributing Covered Code, include this CDDL HEADER in each * file and include the License file at usr/src/OPENSOLARIS.LICENSE. * If applicable, add the following below this CDDL HEADER, with the * fields enclosed by brackets "[]" replaced with your own identifying * information: Portions Copyright [yyyy] [name of copyright owner] * * CDDL HEADER END */ /* * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. * Copyright (c) 2011, 2020 by Delphix. All rights reserved. * Copyright (c) 2013 Steven Hartland. All rights reserved. * Copyright (c) 2014 Spectra Logic Corporation, All rights reserved. * Copyright 2016 Nexenta Systems, Inc. All rights reserved. */ #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include /* * ZFS Write Throttle * ------------------ * * ZFS must limit the rate of incoming writes to the rate at which it is able * to sync data modifications to the backend storage. Throttling by too much * creates an artificial limit; throttling by too little can only be sustained * for short periods and would lead to highly lumpy performance. On a per-pool * basis, ZFS tracks the amount of modified (dirty) data. As operations change * data, the amount of dirty data increases; as ZFS syncs out data, the amount * of dirty data decreases. When the amount of dirty data exceeds a * predetermined threshold further modifications are blocked until the amount * of dirty data decreases (as data is synced out). * * The limit on dirty data is tunable, and should be adjusted according to * both the IO capacity and available memory of the system. The larger the * window, the more ZFS is able to aggregate and amortize metadata (and data) * changes. However, memory is a limited resource, and allowing for more dirty * data comes at the cost of keeping other useful data in memory (for example * ZFS data cached by the ARC). * * Implementation * * As buffers are modified dsl_pool_willuse_space() increments both the per- * txg (dp_dirty_pertxg[]) and poolwide (dp_dirty_total) accounting of * dirty space used; dsl_pool_dirty_space() decrements those values as data * is synced out from dsl_pool_sync(). While only the poolwide value is * relevant, the per-txg value is useful for debugging. The tunable * zfs_dirty_data_max determines the dirty space limit. Once that value is * exceeded, new writes are halted until space frees up. * * The zfs_dirty_data_sync_percent tunable dictates the threshold at which we * ensure that there is a txg syncing (see the comment in txg.c for a full * description of transaction group stages). * * The IO scheduler uses both the dirty space limit and current amount of * dirty data as inputs. Those values affect the number of concurrent IOs ZFS * issues. See the comment in vdev_queue.c for details of the IO scheduler. * * The delay is also calculated based on the amount of dirty data. See the * comment above dmu_tx_delay() for details. */ /* * zfs_dirty_data_max will be set to zfs_dirty_data_max_percent% of all memory, * capped at zfs_dirty_data_max_max. It can also be overridden with a module * parameter. */ uint64_t zfs_dirty_data_max = 0; uint64_t zfs_dirty_data_max_max = 0; uint_t zfs_dirty_data_max_percent = 10; uint_t zfs_dirty_data_max_max_percent = 25; /* * The upper limit of TX_WRITE log data. Write operations are throttled * when approaching the limit until log data is cleared out after txg sync. * It only counts TX_WRITE log with WR_COPIED or WR_NEED_COPY. */ uint64_t zfs_wrlog_data_max = 0; /* * If there's at least this much dirty data (as a percentage of * zfs_dirty_data_max), push out a txg. This should be less than * zfs_vdev_async_write_active_min_dirty_percent. */ static uint_t zfs_dirty_data_sync_percent = 20; /* * Once there is this amount of dirty data, the dmu_tx_delay() will kick in * and delay each transaction. * This value should be >= zfs_vdev_async_write_active_max_dirty_percent. */ uint_t zfs_delay_min_dirty_percent = 60; /* * This controls how quickly the delay approaches infinity. * Larger values cause it to delay more for a given amount of dirty data. * Therefore larger values will cause there to be less dirty data for a * given throughput. * * For the smoothest delay, this value should be about 1 billion divided * by the maximum number of operations per second. This will smoothly * handle between 10x and 1/10th this number. * * Note: zfs_delay_scale * zfs_dirty_data_max must be < 2^64, due to the * multiply in dmu_tx_delay(). */ uint64_t zfs_delay_scale = 1000 * 1000 * 1000 / 2000; /* * This determines the number of threads used by the dp_sync_taskq. */ static int zfs_sync_taskq_batch_pct = 75; /* * These tunables determine the behavior of how zil_itxg_clean() is * called via zil_clean() in the context of spa_sync(). When an itxg * list needs to be cleaned, TQ_NOSLEEP will be used when dispatching. * If the dispatch fails, the call to zil_itxg_clean() will occur * synchronously in the context of spa_sync(), which can negatively * impact the performance of spa_sync() (e.g. in the case of the itxg * list having a large number of itxs that needs to be cleaned). * * Thus, these tunables can be used to manipulate the behavior of the * taskq used by zil_clean(); they determine the number of taskq entries * that are pre-populated when the taskq is first created (via the * "zfs_zil_clean_taskq_minalloc" tunable) and the maximum number of * taskq entries that are cached after an on-demand allocation (via the * "zfs_zil_clean_taskq_maxalloc"). * * The idea being, we want to try reasonably hard to ensure there will * already be a taskq entry pre-allocated by the time that it is needed * by zil_clean(). This way, we can avoid the possibility of an * on-demand allocation of a new taskq entry from failing, which would * result in zil_itxg_clean() being called synchronously from zil_clean() * (which can adversely affect performance of spa_sync()). * * Additionally, the number of threads used by the taskq can be * configured via the "zfs_zil_clean_taskq_nthr_pct" tunable. */ static int zfs_zil_clean_taskq_nthr_pct = 100; static int zfs_zil_clean_taskq_minalloc = 1024; static int zfs_zil_clean_taskq_maxalloc = 1024 * 1024; int dsl_pool_open_special_dir(dsl_pool_t *dp, const char *name, dsl_dir_t **ddp) { uint64_t obj; int err; err = zap_lookup(dp->dp_meta_objset, dsl_dir_phys(dp->dp_root_dir)->dd_child_dir_zapobj, name, sizeof (obj), 1, &obj); if (err) return (err); return (dsl_dir_hold_obj(dp, obj, name, dp, ddp)); } static dsl_pool_t * dsl_pool_open_impl(spa_t *spa, uint64_t txg) { dsl_pool_t *dp; blkptr_t *bp = spa_get_rootblkptr(spa); dp = kmem_zalloc(sizeof (dsl_pool_t), KM_SLEEP); dp->dp_spa = spa; dp->dp_meta_rootbp = *bp; rrw_init(&dp->dp_config_rwlock, B_TRUE); txg_init(dp, txg); mmp_init(spa); txg_list_create(&dp->dp_dirty_datasets, spa, offsetof(dsl_dataset_t, ds_dirty_link)); txg_list_create(&dp->dp_dirty_zilogs, spa, offsetof(zilog_t, zl_dirty_link)); txg_list_create(&dp->dp_dirty_dirs, spa, offsetof(dsl_dir_t, dd_dirty_link)); txg_list_create(&dp->dp_sync_tasks, spa, offsetof(dsl_sync_task_t, dst_node)); txg_list_create(&dp->dp_early_sync_tasks, spa, offsetof(dsl_sync_task_t, dst_node)); dp->dp_sync_taskq = taskq_create("dp_sync_taskq", zfs_sync_taskq_batch_pct, minclsyspri, 1, INT_MAX, TASKQ_THREADS_CPU_PCT); dp->dp_zil_clean_taskq = taskq_create("dp_zil_clean_taskq", zfs_zil_clean_taskq_nthr_pct, minclsyspri, zfs_zil_clean_taskq_minalloc, zfs_zil_clean_taskq_maxalloc, TASKQ_PREPOPULATE | TASKQ_THREADS_CPU_PCT); mutex_init(&dp->dp_lock, NULL, MUTEX_DEFAULT, NULL); cv_init(&dp->dp_spaceavail_cv, NULL, CV_DEFAULT, NULL); aggsum_init(&dp->dp_wrlog_total, 0); for (int i = 0; i < TXG_SIZE; i++) { aggsum_init(&dp->dp_wrlog_pertxg[i], 0); } dp->dp_zrele_taskq = taskq_create("z_zrele", 100, defclsyspri, boot_ncpus * 8, INT_MAX, TASKQ_PREPOPULATE | TASKQ_DYNAMIC | TASKQ_THREADS_CPU_PCT); dp->dp_unlinked_drain_taskq = taskq_create("z_unlinked_drain", 100, defclsyspri, boot_ncpus, INT_MAX, TASKQ_PREPOPULATE | TASKQ_DYNAMIC | TASKQ_THREADS_CPU_PCT); return (dp); } int dsl_pool_init(spa_t *spa, uint64_t txg, dsl_pool_t **dpp) { int err; dsl_pool_t *dp = dsl_pool_open_impl(spa, txg); /* * Initialize the caller's dsl_pool_t structure before we actually open * the meta objset. This is done because a self-healing write zio may * be issued as part of dmu_objset_open_impl() and the spa needs its * dsl_pool_t initialized in order to handle the write. */ *dpp = dp; err = dmu_objset_open_impl(spa, NULL, &dp->dp_meta_rootbp, &dp->dp_meta_objset); if (err != 0) { dsl_pool_close(dp); *dpp = NULL; } return (err); } int dsl_pool_open(dsl_pool_t *dp) { int err; dsl_dir_t *dd; dsl_dataset_t *ds; uint64_t obj; rrw_enter(&dp->dp_config_rwlock, RW_WRITER, FTAG); err = zap_lookup(dp->dp_meta_objset, DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_ROOT_DATASET, sizeof (uint64_t), 1, &dp->dp_root_dir_obj); if (err) goto out; err = dsl_dir_hold_obj(dp, dp->dp_root_dir_obj, NULL, dp, &dp->dp_root_dir); if (err) goto out; err = dsl_pool_open_special_dir(dp, MOS_DIR_NAME, &dp->dp_mos_dir); if (err) goto out; if (spa_version(dp->dp_spa) >= SPA_VERSION_ORIGIN) { err = dsl_pool_open_special_dir(dp, ORIGIN_DIR_NAME, &dd); if (err) goto out; err = dsl_dataset_hold_obj(dp, dsl_dir_phys(dd)->dd_head_dataset_obj, FTAG, &ds); if (err == 0) { err = dsl_dataset_hold_obj(dp, dsl_dataset_phys(ds)->ds_prev_snap_obj, dp, &dp->dp_origin_snap); dsl_dataset_rele(ds, FTAG); } dsl_dir_rele(dd, dp); if (err) goto out; } if (spa_version(dp->dp_spa) >= SPA_VERSION_DEADLISTS) { err = dsl_pool_open_special_dir(dp, FREE_DIR_NAME, &dp->dp_free_dir); if (err) goto out; err = zap_lookup(dp->dp_meta_objset, DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_FREE_BPOBJ, sizeof (uint64_t), 1, &obj); if (err) goto out; VERIFY0(bpobj_open(&dp->dp_free_bpobj, dp->dp_meta_objset, obj)); } if (spa_feature_is_active(dp->dp_spa, SPA_FEATURE_OBSOLETE_COUNTS)) { err = zap_lookup(dp->dp_meta_objset, DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_OBSOLETE_BPOBJ, sizeof (uint64_t), 1, &obj); if (err == 0) { VERIFY0(bpobj_open(&dp->dp_obsolete_bpobj, dp->dp_meta_objset, obj)); } else if (err == ENOENT) { /* * We might not have created the remap bpobj yet. */ - err = 0; } else { goto out; } } /* * Note: errors ignored, because the these special dirs, used for * space accounting, are only created on demand. */ (void) dsl_pool_open_special_dir(dp, LEAK_DIR_NAME, &dp->dp_leak_dir); if (spa_feature_is_active(dp->dp_spa, SPA_FEATURE_ASYNC_DESTROY)) { err = zap_lookup(dp->dp_meta_objset, DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_BPTREE_OBJ, sizeof (uint64_t), 1, &dp->dp_bptree_obj); if (err != 0) goto out; } if (spa_feature_is_active(dp->dp_spa, SPA_FEATURE_EMPTY_BPOBJ)) { err = zap_lookup(dp->dp_meta_objset, DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_EMPTY_BPOBJ, sizeof (uint64_t), 1, &dp->dp_empty_bpobj); if (err != 0) goto out; } err = zap_lookup(dp->dp_meta_objset, DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_TMP_USERREFS, sizeof (uint64_t), 1, &dp->dp_tmp_userrefs_obj); if (err == ENOENT) err = 0; if (err) goto out; err = dsl_scan_init(dp, dp->dp_tx.tx_open_txg); out: rrw_exit(&dp->dp_config_rwlock, FTAG); return (err); } void dsl_pool_close(dsl_pool_t *dp) { /* * Drop our references from dsl_pool_open(). * * Since we held the origin_snap from "syncing" context (which * includes pool-opening context), it actually only got a "ref" * and not a hold, so just drop that here. */ if (dp->dp_origin_snap != NULL) dsl_dataset_rele(dp->dp_origin_snap, dp); if (dp->dp_mos_dir != NULL) dsl_dir_rele(dp->dp_mos_dir, dp); if (dp->dp_free_dir != NULL) dsl_dir_rele(dp->dp_free_dir, dp); if (dp->dp_leak_dir != NULL) dsl_dir_rele(dp->dp_leak_dir, dp); if (dp->dp_root_dir != NULL) dsl_dir_rele(dp->dp_root_dir, dp); bpobj_close(&dp->dp_free_bpobj); bpobj_close(&dp->dp_obsolete_bpobj); /* undo the dmu_objset_open_impl(mos) from dsl_pool_open() */ if (dp->dp_meta_objset != NULL) dmu_objset_evict(dp->dp_meta_objset); txg_list_destroy(&dp->dp_dirty_datasets); txg_list_destroy(&dp->dp_dirty_zilogs); txg_list_destroy(&dp->dp_sync_tasks); txg_list_destroy(&dp->dp_early_sync_tasks); txg_list_destroy(&dp->dp_dirty_dirs); taskq_destroy(dp->dp_zil_clean_taskq); taskq_destroy(dp->dp_sync_taskq); /* * We can't set retry to TRUE since we're explicitly specifying * a spa to flush. This is good enough; any missed buffers for * this spa won't cause trouble, and they'll eventually fall * out of the ARC just like any other unused buffer. */ arc_flush(dp->dp_spa, FALSE); mmp_fini(dp->dp_spa); txg_fini(dp); dsl_scan_fini(dp); dmu_buf_user_evict_wait(); rrw_destroy(&dp->dp_config_rwlock); mutex_destroy(&dp->dp_lock); cv_destroy(&dp->dp_spaceavail_cv); ASSERT0(aggsum_value(&dp->dp_wrlog_total)); aggsum_fini(&dp->dp_wrlog_total); for (int i = 0; i < TXG_SIZE; i++) { ASSERT0(aggsum_value(&dp->dp_wrlog_pertxg[i])); aggsum_fini(&dp->dp_wrlog_pertxg[i]); } taskq_destroy(dp->dp_unlinked_drain_taskq); taskq_destroy(dp->dp_zrele_taskq); if (dp->dp_blkstats != NULL) vmem_free(dp->dp_blkstats, sizeof (zfs_all_blkstats_t)); kmem_free(dp, sizeof (dsl_pool_t)); } void dsl_pool_create_obsolete_bpobj(dsl_pool_t *dp, dmu_tx_t *tx) { uint64_t obj; /* * Currently, we only create the obsolete_bpobj where there are * indirect vdevs with referenced mappings. */ ASSERT(spa_feature_is_active(dp->dp_spa, SPA_FEATURE_DEVICE_REMOVAL)); /* create and open the obsolete_bpobj */ obj = bpobj_alloc(dp->dp_meta_objset, SPA_OLD_MAXBLOCKSIZE, tx); VERIFY0(bpobj_open(&dp->dp_obsolete_bpobj, dp->dp_meta_objset, obj)); VERIFY0(zap_add(dp->dp_meta_objset, DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_OBSOLETE_BPOBJ, sizeof (uint64_t), 1, &obj, tx)); spa_feature_incr(dp->dp_spa, SPA_FEATURE_OBSOLETE_COUNTS, tx); } void dsl_pool_destroy_obsolete_bpobj(dsl_pool_t *dp, dmu_tx_t *tx) { spa_feature_decr(dp->dp_spa, SPA_FEATURE_OBSOLETE_COUNTS, tx); VERIFY0(zap_remove(dp->dp_meta_objset, DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_OBSOLETE_BPOBJ, tx)); bpobj_free(dp->dp_meta_objset, dp->dp_obsolete_bpobj.bpo_object, tx); bpobj_close(&dp->dp_obsolete_bpobj); } dsl_pool_t * dsl_pool_create(spa_t *spa, nvlist_t *zplprops __attribute__((unused)), dsl_crypto_params_t *dcp, uint64_t txg) { int err; dsl_pool_t *dp = dsl_pool_open_impl(spa, txg); dmu_tx_t *tx = dmu_tx_create_assigned(dp, txg); #ifdef _KERNEL objset_t *os; #else objset_t *os __attribute__((unused)); #endif dsl_dataset_t *ds; uint64_t obj; rrw_enter(&dp->dp_config_rwlock, RW_WRITER, FTAG); /* create and open the MOS (meta-objset) */ dp->dp_meta_objset = dmu_objset_create_impl(spa, NULL, &dp->dp_meta_rootbp, DMU_OST_META, tx); spa->spa_meta_objset = dp->dp_meta_objset; /* create the pool directory */ err = zap_create_claim(dp->dp_meta_objset, DMU_POOL_DIRECTORY_OBJECT, DMU_OT_OBJECT_DIRECTORY, DMU_OT_NONE, 0, tx); ASSERT0(err); /* Initialize scan structures */ VERIFY0(dsl_scan_init(dp, txg)); /* create and open the root dir */ dp->dp_root_dir_obj = dsl_dir_create_sync(dp, NULL, NULL, tx); VERIFY0(dsl_dir_hold_obj(dp, dp->dp_root_dir_obj, NULL, dp, &dp->dp_root_dir)); /* create and open the meta-objset dir */ (void) dsl_dir_create_sync(dp, dp->dp_root_dir, MOS_DIR_NAME, tx); VERIFY0(dsl_pool_open_special_dir(dp, MOS_DIR_NAME, &dp->dp_mos_dir)); if (spa_version(spa) >= SPA_VERSION_DEADLISTS) { /* create and open the free dir */ (void) dsl_dir_create_sync(dp, dp->dp_root_dir, FREE_DIR_NAME, tx); VERIFY0(dsl_pool_open_special_dir(dp, FREE_DIR_NAME, &dp->dp_free_dir)); /* create and open the free_bplist */ obj = bpobj_alloc(dp->dp_meta_objset, SPA_OLD_MAXBLOCKSIZE, tx); VERIFY(zap_add(dp->dp_meta_objset, DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_FREE_BPOBJ, sizeof (uint64_t), 1, &obj, tx) == 0); VERIFY0(bpobj_open(&dp->dp_free_bpobj, dp->dp_meta_objset, obj)); } if (spa_version(spa) >= SPA_VERSION_DSL_SCRUB) dsl_pool_create_origin(dp, tx); /* * Some features may be needed when creating the root dataset, so we * create the feature objects here. */ if (spa_version(spa) >= SPA_VERSION_FEATURES) spa_feature_create_zap_objects(spa, tx); if (dcp != NULL && dcp->cp_crypt != ZIO_CRYPT_OFF && dcp->cp_crypt != ZIO_CRYPT_INHERIT) spa_feature_enable(spa, SPA_FEATURE_ENCRYPTION, tx); /* create the root dataset */ obj = dsl_dataset_create_sync_dd(dp->dp_root_dir, NULL, dcp, 0, tx); /* create the root objset */ VERIFY0(dsl_dataset_hold_obj_flags(dp, obj, DS_HOLD_FLAG_DECRYPT, FTAG, &ds)); rrw_enter(&ds->ds_bp_rwlock, RW_READER, FTAG); os = dmu_objset_create_impl(dp->dp_spa, ds, dsl_dataset_get_blkptr(ds), DMU_OST_ZFS, tx); rrw_exit(&ds->ds_bp_rwlock, FTAG); #ifdef _KERNEL zfs_create_fs(os, kcred, zplprops, tx); #endif dsl_dataset_rele_flags(ds, DS_HOLD_FLAG_DECRYPT, FTAG); dmu_tx_commit(tx); rrw_exit(&dp->dp_config_rwlock, FTAG); return (dp); } /* * Account for the meta-objset space in its placeholder dsl_dir. */ void dsl_pool_mos_diduse_space(dsl_pool_t *dp, int64_t used, int64_t comp, int64_t uncomp) { ASSERT3U(comp, ==, uncomp); /* it's all metadata */ mutex_enter(&dp->dp_lock); dp->dp_mos_used_delta += used; dp->dp_mos_compressed_delta += comp; dp->dp_mos_uncompressed_delta += uncomp; mutex_exit(&dp->dp_lock); } static void dsl_pool_sync_mos(dsl_pool_t *dp, dmu_tx_t *tx) { zio_t *zio = zio_root(dp->dp_spa, NULL, NULL, ZIO_FLAG_MUSTSUCCEED); dmu_objset_sync(dp->dp_meta_objset, zio, tx); VERIFY0(zio_wait(zio)); dmu_objset_sync_done(dp->dp_meta_objset, tx); taskq_wait(dp->dp_sync_taskq); multilist_destroy(&dp->dp_meta_objset->os_synced_dnodes); dprintf_bp(&dp->dp_meta_rootbp, "meta objset rootbp is %s", ""); spa_set_rootblkptr(dp->dp_spa, &dp->dp_meta_rootbp); } static void dsl_pool_dirty_delta(dsl_pool_t *dp, int64_t delta) { ASSERT(MUTEX_HELD(&dp->dp_lock)); if (delta < 0) ASSERT3U(-delta, <=, dp->dp_dirty_total); dp->dp_dirty_total += delta; /* * Note: we signal even when increasing dp_dirty_total. * This ensures forward progress -- each thread wakes the next waiter. */ if (dp->dp_dirty_total < zfs_dirty_data_max) cv_signal(&dp->dp_spaceavail_cv); } void dsl_pool_wrlog_count(dsl_pool_t *dp, int64_t size, uint64_t txg) { ASSERT3S(size, >=, 0); aggsum_add(&dp->dp_wrlog_pertxg[txg & TXG_MASK], size); aggsum_add(&dp->dp_wrlog_total, size); /* Choose a value slightly bigger than min dirty sync bytes */ uint64_t sync_min = zfs_wrlog_data_max * (zfs_dirty_data_sync_percent + 10) / 200; if (aggsum_compare(&dp->dp_wrlog_pertxg[txg & TXG_MASK], sync_min) > 0) txg_kick(dp, txg); } boolean_t dsl_pool_need_wrlog_delay(dsl_pool_t *dp) { uint64_t delay_min_bytes = zfs_wrlog_data_max * zfs_delay_min_dirty_percent / 100; return (aggsum_compare(&dp->dp_wrlog_total, delay_min_bytes) > 0); } static void dsl_pool_wrlog_clear(dsl_pool_t *dp, uint64_t txg) { int64_t delta; delta = -(int64_t)aggsum_value(&dp->dp_wrlog_pertxg[txg & TXG_MASK]); aggsum_add(&dp->dp_wrlog_pertxg[txg & TXG_MASK], delta); aggsum_add(&dp->dp_wrlog_total, delta); /* Compact per-CPU sums after the big change. */ (void) aggsum_value(&dp->dp_wrlog_pertxg[txg & TXG_MASK]); (void) aggsum_value(&dp->dp_wrlog_total); } #ifdef ZFS_DEBUG static boolean_t dsl_early_sync_task_verify(dsl_pool_t *dp, uint64_t txg) { spa_t *spa = dp->dp_spa; vdev_t *rvd = spa->spa_root_vdev; for (uint64_t c = 0; c < rvd->vdev_children; c++) { vdev_t *vd = rvd->vdev_child[c]; txg_list_t *tl = &vd->vdev_ms_list; metaslab_t *ms; for (ms = txg_list_head(tl, TXG_CLEAN(txg)); ms; ms = txg_list_next(tl, ms, TXG_CLEAN(txg))) { VERIFY(range_tree_is_empty(ms->ms_freeing)); VERIFY(range_tree_is_empty(ms->ms_checkpointing)); } } return (B_TRUE); } #else #define dsl_early_sync_task_verify(dp, txg) \ ((void) sizeof (dp), (void) sizeof (txg), B_TRUE) #endif void dsl_pool_sync(dsl_pool_t *dp, uint64_t txg) { zio_t *zio; dmu_tx_t *tx; dsl_dir_t *dd; dsl_dataset_t *ds; objset_t *mos = dp->dp_meta_objset; list_t synced_datasets; list_create(&synced_datasets, sizeof (dsl_dataset_t), offsetof(dsl_dataset_t, ds_synced_link)); tx = dmu_tx_create_assigned(dp, txg); /* * Run all early sync tasks before writing out any dirty blocks. * For more info on early sync tasks see block comment in * dsl_early_sync_task(). */ if (!txg_list_empty(&dp->dp_early_sync_tasks, txg)) { dsl_sync_task_t *dst; ASSERT3U(spa_sync_pass(dp->dp_spa), ==, 1); while ((dst = txg_list_remove(&dp->dp_early_sync_tasks, txg)) != NULL) { ASSERT(dsl_early_sync_task_verify(dp, txg)); dsl_sync_task_sync(dst, tx); } ASSERT(dsl_early_sync_task_verify(dp, txg)); } /* * Write out all dirty blocks of dirty datasets. */ zio = zio_root(dp->dp_spa, NULL, NULL, ZIO_FLAG_MUSTSUCCEED); while ((ds = txg_list_remove(&dp->dp_dirty_datasets, txg)) != NULL) { /* * We must not sync any non-MOS datasets twice, because * we may have taken a snapshot of them. However, we * may sync newly-created datasets on pass 2. */ ASSERT(!list_link_active(&ds->ds_synced_link)); list_insert_tail(&synced_datasets, ds); dsl_dataset_sync(ds, zio, tx); } VERIFY0(zio_wait(zio)); /* * Update the long range free counter after * we're done syncing user data */ mutex_enter(&dp->dp_lock); ASSERT(spa_sync_pass(dp->dp_spa) == 1 || dp->dp_long_free_dirty_pertxg[txg & TXG_MASK] == 0); dp->dp_long_free_dirty_pertxg[txg & TXG_MASK] = 0; mutex_exit(&dp->dp_lock); /* * After the data blocks have been written (ensured by the zio_wait() * above), update the user/group/project space accounting. This happens * in tasks dispatched to dp_sync_taskq, so wait for them before * continuing. */ for (ds = list_head(&synced_datasets); ds != NULL; ds = list_next(&synced_datasets, ds)) { dmu_objset_sync_done(ds->ds_objset, tx); } taskq_wait(dp->dp_sync_taskq); /* * Sync the datasets again to push out the changes due to * userspace updates. This must be done before we process the * sync tasks, so that any snapshots will have the correct * user accounting information (and we won't get confused * about which blocks are part of the snapshot). */ zio = zio_root(dp->dp_spa, NULL, NULL, ZIO_FLAG_MUSTSUCCEED); while ((ds = txg_list_remove(&dp->dp_dirty_datasets, txg)) != NULL) { objset_t *os = ds->ds_objset; ASSERT(list_link_active(&ds->ds_synced_link)); dmu_buf_rele(ds->ds_dbuf, ds); dsl_dataset_sync(ds, zio, tx); /* * Release any key mappings created by calls to * dsl_dataset_dirty() from the userquota accounting * code paths. */ if (os->os_encrypted && !os->os_raw_receive && !os->os_next_write_raw[txg & TXG_MASK]) { ASSERT3P(ds->ds_key_mapping, !=, NULL); key_mapping_rele(dp->dp_spa, ds->ds_key_mapping, ds); } } VERIFY0(zio_wait(zio)); /* * Now that the datasets have been completely synced, we can * clean up our in-memory structures accumulated while syncing: * * - move dead blocks from the pending deadlist and livelists * to the on-disk versions * - release hold from dsl_dataset_dirty() * - release key mapping hold from dsl_dataset_dirty() */ while ((ds = list_remove_head(&synced_datasets)) != NULL) { objset_t *os = ds->ds_objset; if (os->os_encrypted && !os->os_raw_receive && !os->os_next_write_raw[txg & TXG_MASK]) { ASSERT3P(ds->ds_key_mapping, !=, NULL); key_mapping_rele(dp->dp_spa, ds->ds_key_mapping, ds); } dsl_dataset_sync_done(ds, tx); } while ((dd = txg_list_remove(&dp->dp_dirty_dirs, txg)) != NULL) { dsl_dir_sync(dd, tx); } /* * The MOS's space is accounted for in the pool/$MOS * (dp_mos_dir). We can't modify the mos while we're syncing * it, so we remember the deltas and apply them here. */ if (dp->dp_mos_used_delta != 0 || dp->dp_mos_compressed_delta != 0 || dp->dp_mos_uncompressed_delta != 0) { dsl_dir_diduse_space(dp->dp_mos_dir, DD_USED_HEAD, dp->dp_mos_used_delta, dp->dp_mos_compressed_delta, dp->dp_mos_uncompressed_delta, tx); dp->dp_mos_used_delta = 0; dp->dp_mos_compressed_delta = 0; dp->dp_mos_uncompressed_delta = 0; } if (dmu_objset_is_dirty(mos, txg)) { dsl_pool_sync_mos(dp, tx); } /* * We have written all of the accounted dirty data, so our * dp_space_towrite should now be zero. However, some seldom-used * code paths do not adhere to this (e.g. dbuf_undirty()). Shore up * the accounting of any dirtied space now. * * Note that, besides any dirty data from datasets, the amount of * dirty data in the MOS is also accounted by the pool. Therefore, * we want to do this cleanup after dsl_pool_sync_mos() so we don't * attempt to update the accounting for the same dirty data twice. * (i.e. at this point we only update the accounting for the space * that we know that we "leaked"). */ dsl_pool_undirty_space(dp, dp->dp_dirty_pertxg[txg & TXG_MASK], txg); /* * If we modify a dataset in the same txg that we want to destroy it, * its dsl_dir's dd_dbuf will be dirty, and thus have a hold on it. * dsl_dir_destroy_check() will fail if there are unexpected holds. * Therefore, we want to sync the MOS (thus syncing the dd_dbuf * and clearing the hold on it) before we process the sync_tasks. * The MOS data dirtied by the sync_tasks will be synced on the next * pass. */ if (!txg_list_empty(&dp->dp_sync_tasks, txg)) { dsl_sync_task_t *dst; /* * No more sync tasks should have been added while we * were syncing. */ ASSERT3U(spa_sync_pass(dp->dp_spa), ==, 1); while ((dst = txg_list_remove(&dp->dp_sync_tasks, txg)) != NULL) dsl_sync_task_sync(dst, tx); } dmu_tx_commit(tx); DTRACE_PROBE2(dsl_pool_sync__done, dsl_pool_t *dp, dp, uint64_t, txg); } void dsl_pool_sync_done(dsl_pool_t *dp, uint64_t txg) { zilog_t *zilog; while ((zilog = txg_list_head(&dp->dp_dirty_zilogs, txg))) { dsl_dataset_t *ds = dmu_objset_ds(zilog->zl_os); /* * We don't remove the zilog from the dp_dirty_zilogs * list until after we've cleaned it. This ensures that * callers of zilog_is_dirty() receive an accurate * answer when they are racing with the spa sync thread. */ zil_clean(zilog, txg); (void) txg_list_remove_this(&dp->dp_dirty_zilogs, zilog, txg); ASSERT(!dmu_objset_is_dirty(zilog->zl_os, txg)); dmu_buf_rele(ds->ds_dbuf, zilog); } dsl_pool_wrlog_clear(dp, txg); ASSERT(!dmu_objset_is_dirty(dp->dp_meta_objset, txg)); } /* * TRUE if the current thread is the tx_sync_thread or if we * are being called from SPA context during pool initialization. */ int dsl_pool_sync_context(dsl_pool_t *dp) { return (curthread == dp->dp_tx.tx_sync_thread || spa_is_initializing(dp->dp_spa) || taskq_member(dp->dp_sync_taskq, curthread)); } /* * This function returns the amount of allocatable space in the pool * minus whatever space is currently reserved by ZFS for specific * purposes. Specifically: * * 1] Any reserved SLOP space * 2] Any space used by the checkpoint * 3] Any space used for deferred frees * * The latter 2 are especially important because they are needed to * rectify the SPA's and DMU's different understanding of how much space * is used. Now the DMU is aware of that extra space tracked by the SPA * without having to maintain a separate special dir (e.g similar to * $MOS, $FREEING, and $LEAKED). * * Note: By deferred frees here, we mean the frees that were deferred * in spa_sync() after sync pass 1 (spa_deferred_bpobj), and not the * segments placed in ms_defer trees during metaslab_sync_done(). */ uint64_t dsl_pool_adjustedsize(dsl_pool_t *dp, zfs_space_check_t slop_policy) { spa_t *spa = dp->dp_spa; uint64_t space, resv, adjustedsize; uint64_t spa_deferred_frees = spa->spa_deferred_bpobj.bpo_phys->bpo_bytes; space = spa_get_dspace(spa) - spa_get_checkpoint_space(spa) - spa_deferred_frees; resv = spa_get_slop_space(spa); switch (slop_policy) { case ZFS_SPACE_CHECK_NORMAL: break; case ZFS_SPACE_CHECK_RESERVED: resv >>= 1; break; case ZFS_SPACE_CHECK_EXTRA_RESERVED: resv >>= 2; break; case ZFS_SPACE_CHECK_NONE: resv = 0; break; default: panic("invalid slop policy value: %d", slop_policy); break; } adjustedsize = (space >= resv) ? (space - resv) : 0; return (adjustedsize); } uint64_t dsl_pool_unreserved_space(dsl_pool_t *dp, zfs_space_check_t slop_policy) { uint64_t poolsize = dsl_pool_adjustedsize(dp, slop_policy); uint64_t deferred = metaslab_class_get_deferred(spa_normal_class(dp->dp_spa)); uint64_t quota = (poolsize >= deferred) ? (poolsize - deferred) : 0; return (quota); } uint64_t dsl_pool_deferred_space(dsl_pool_t *dp) { return (metaslab_class_get_deferred(spa_normal_class(dp->dp_spa))); } boolean_t dsl_pool_need_dirty_delay(dsl_pool_t *dp) { uint64_t delay_min_bytes = zfs_dirty_data_max * zfs_delay_min_dirty_percent / 100; mutex_enter(&dp->dp_lock); uint64_t dirty = dp->dp_dirty_total; mutex_exit(&dp->dp_lock); return (dirty > delay_min_bytes); } static boolean_t dsl_pool_need_dirty_sync(dsl_pool_t *dp, uint64_t txg) { ASSERT(MUTEX_HELD(&dp->dp_lock)); uint64_t dirty_min_bytes = zfs_dirty_data_max * zfs_dirty_data_sync_percent / 100; uint64_t dirty = dp->dp_dirty_pertxg[txg & TXG_MASK]; return (dirty > dirty_min_bytes); } void dsl_pool_dirty_space(dsl_pool_t *dp, int64_t space, dmu_tx_t *tx) { if (space > 0) { mutex_enter(&dp->dp_lock); dp->dp_dirty_pertxg[tx->tx_txg & TXG_MASK] += space; dsl_pool_dirty_delta(dp, space); boolean_t needsync = !dmu_tx_is_syncing(tx) && dsl_pool_need_dirty_sync(dp, tx->tx_txg); mutex_exit(&dp->dp_lock); if (needsync) txg_kick(dp, tx->tx_txg); } } void dsl_pool_undirty_space(dsl_pool_t *dp, int64_t space, uint64_t txg) { ASSERT3S(space, >=, 0); if (space == 0) return; mutex_enter(&dp->dp_lock); if (dp->dp_dirty_pertxg[txg & TXG_MASK] < space) { /* XXX writing something we didn't dirty? */ space = dp->dp_dirty_pertxg[txg & TXG_MASK]; } ASSERT3U(dp->dp_dirty_pertxg[txg & TXG_MASK], >=, space); dp->dp_dirty_pertxg[txg & TXG_MASK] -= space; ASSERT3U(dp->dp_dirty_total, >=, space); dsl_pool_dirty_delta(dp, -space); mutex_exit(&dp->dp_lock); } static int upgrade_clones_cb(dsl_pool_t *dp, dsl_dataset_t *hds, void *arg) { dmu_tx_t *tx = arg; dsl_dataset_t *ds, *prev = NULL; int err; err = dsl_dataset_hold_obj(dp, hds->ds_object, FTAG, &ds); if (err) return (err); while (dsl_dataset_phys(ds)->ds_prev_snap_obj != 0) { err = dsl_dataset_hold_obj(dp, dsl_dataset_phys(ds)->ds_prev_snap_obj, FTAG, &prev); if (err) { dsl_dataset_rele(ds, FTAG); return (err); } if (dsl_dataset_phys(prev)->ds_next_snap_obj != ds->ds_object) break; dsl_dataset_rele(ds, FTAG); ds = prev; prev = NULL; } if (prev == NULL) { prev = dp->dp_origin_snap; /* * The $ORIGIN can't have any data, or the accounting * will be wrong. */ rrw_enter(&ds->ds_bp_rwlock, RW_READER, FTAG); ASSERT0(dsl_dataset_phys(prev)->ds_bp.blk_birth); rrw_exit(&ds->ds_bp_rwlock, FTAG); /* The origin doesn't get attached to itself */ if (ds->ds_object == prev->ds_object) { dsl_dataset_rele(ds, FTAG); return (0); } dmu_buf_will_dirty(ds->ds_dbuf, tx); dsl_dataset_phys(ds)->ds_prev_snap_obj = prev->ds_object; dsl_dataset_phys(ds)->ds_prev_snap_txg = dsl_dataset_phys(prev)->ds_creation_txg; dmu_buf_will_dirty(ds->ds_dir->dd_dbuf, tx); dsl_dir_phys(ds->ds_dir)->dd_origin_obj = prev->ds_object; dmu_buf_will_dirty(prev->ds_dbuf, tx); dsl_dataset_phys(prev)->ds_num_children++; if (dsl_dataset_phys(ds)->ds_next_snap_obj == 0) { ASSERT(ds->ds_prev == NULL); VERIFY0(dsl_dataset_hold_obj(dp, dsl_dataset_phys(ds)->ds_prev_snap_obj, ds, &ds->ds_prev)); } } ASSERT3U(dsl_dir_phys(ds->ds_dir)->dd_origin_obj, ==, prev->ds_object); ASSERT3U(dsl_dataset_phys(ds)->ds_prev_snap_obj, ==, prev->ds_object); if (dsl_dataset_phys(prev)->ds_next_clones_obj == 0) { dmu_buf_will_dirty(prev->ds_dbuf, tx); dsl_dataset_phys(prev)->ds_next_clones_obj = zap_create(dp->dp_meta_objset, DMU_OT_NEXT_CLONES, DMU_OT_NONE, 0, tx); } VERIFY0(zap_add_int(dp->dp_meta_objset, dsl_dataset_phys(prev)->ds_next_clones_obj, ds->ds_object, tx)); dsl_dataset_rele(ds, FTAG); if (prev != dp->dp_origin_snap) dsl_dataset_rele(prev, FTAG); return (0); } void dsl_pool_upgrade_clones(dsl_pool_t *dp, dmu_tx_t *tx) { ASSERT(dmu_tx_is_syncing(tx)); ASSERT(dp->dp_origin_snap != NULL); VERIFY0(dmu_objset_find_dp(dp, dp->dp_root_dir_obj, upgrade_clones_cb, tx, DS_FIND_CHILDREN | DS_FIND_SERIALIZE)); } static int upgrade_dir_clones_cb(dsl_pool_t *dp, dsl_dataset_t *ds, void *arg) { dmu_tx_t *tx = arg; objset_t *mos = dp->dp_meta_objset; if (dsl_dir_phys(ds->ds_dir)->dd_origin_obj != 0) { dsl_dataset_t *origin; VERIFY0(dsl_dataset_hold_obj(dp, dsl_dir_phys(ds->ds_dir)->dd_origin_obj, FTAG, &origin)); if (dsl_dir_phys(origin->ds_dir)->dd_clones == 0) { dmu_buf_will_dirty(origin->ds_dir->dd_dbuf, tx); dsl_dir_phys(origin->ds_dir)->dd_clones = zap_create(mos, DMU_OT_DSL_CLONES, DMU_OT_NONE, 0, tx); } VERIFY0(zap_add_int(dp->dp_meta_objset, dsl_dir_phys(origin->ds_dir)->dd_clones, ds->ds_object, tx)); dsl_dataset_rele(origin, FTAG); } return (0); } void dsl_pool_upgrade_dir_clones(dsl_pool_t *dp, dmu_tx_t *tx) { uint64_t obj; ASSERT(dmu_tx_is_syncing(tx)); (void) dsl_dir_create_sync(dp, dp->dp_root_dir, FREE_DIR_NAME, tx); VERIFY0(dsl_pool_open_special_dir(dp, FREE_DIR_NAME, &dp->dp_free_dir)); /* * We can't use bpobj_alloc(), because spa_version() still * returns the old version, and we need a new-version bpobj with * subobj support. So call dmu_object_alloc() directly. */ obj = dmu_object_alloc(dp->dp_meta_objset, DMU_OT_BPOBJ, SPA_OLD_MAXBLOCKSIZE, DMU_OT_BPOBJ_HDR, sizeof (bpobj_phys_t), tx); VERIFY0(zap_add(dp->dp_meta_objset, DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_FREE_BPOBJ, sizeof (uint64_t), 1, &obj, tx)); VERIFY0(bpobj_open(&dp->dp_free_bpobj, dp->dp_meta_objset, obj)); VERIFY0(dmu_objset_find_dp(dp, dp->dp_root_dir_obj, upgrade_dir_clones_cb, tx, DS_FIND_CHILDREN | DS_FIND_SERIALIZE)); } void dsl_pool_create_origin(dsl_pool_t *dp, dmu_tx_t *tx) { uint64_t dsobj; dsl_dataset_t *ds; ASSERT(dmu_tx_is_syncing(tx)); ASSERT(dp->dp_origin_snap == NULL); ASSERT(rrw_held(&dp->dp_config_rwlock, RW_WRITER)); /* create the origin dir, ds, & snap-ds */ dsobj = dsl_dataset_create_sync(dp->dp_root_dir, ORIGIN_DIR_NAME, NULL, 0, kcred, NULL, tx); VERIFY0(dsl_dataset_hold_obj(dp, dsobj, FTAG, &ds)); dsl_dataset_snapshot_sync_impl(ds, ORIGIN_DIR_NAME, tx); VERIFY0(dsl_dataset_hold_obj(dp, dsl_dataset_phys(ds)->ds_prev_snap_obj, dp, &dp->dp_origin_snap)); dsl_dataset_rele(ds, FTAG); } taskq_t * dsl_pool_zrele_taskq(dsl_pool_t *dp) { return (dp->dp_zrele_taskq); } taskq_t * dsl_pool_unlinked_drain_taskq(dsl_pool_t *dp) { return (dp->dp_unlinked_drain_taskq); } /* * Walk through the pool-wide zap object of temporary snapshot user holds * and release them. */ void dsl_pool_clean_tmp_userrefs(dsl_pool_t *dp) { zap_attribute_t za; zap_cursor_t zc; objset_t *mos = dp->dp_meta_objset; uint64_t zapobj = dp->dp_tmp_userrefs_obj; nvlist_t *holds; if (zapobj == 0) return; ASSERT(spa_version(dp->dp_spa) >= SPA_VERSION_USERREFS); holds = fnvlist_alloc(); for (zap_cursor_init(&zc, mos, zapobj); zap_cursor_retrieve(&zc, &za) == 0; zap_cursor_advance(&zc)) { char *htag; nvlist_t *tags; htag = strchr(za.za_name, '-'); *htag = '\0'; ++htag; if (nvlist_lookup_nvlist(holds, za.za_name, &tags) != 0) { tags = fnvlist_alloc(); fnvlist_add_boolean(tags, htag); fnvlist_add_nvlist(holds, za.za_name, tags); fnvlist_free(tags); } else { fnvlist_add_boolean(tags, htag); } } dsl_dataset_user_release_tmp(dp, holds); fnvlist_free(holds); zap_cursor_fini(&zc); } /* * Create the pool-wide zap object for storing temporary snapshot holds. */ static void dsl_pool_user_hold_create_obj(dsl_pool_t *dp, dmu_tx_t *tx) { objset_t *mos = dp->dp_meta_objset; ASSERT(dp->dp_tmp_userrefs_obj == 0); ASSERT(dmu_tx_is_syncing(tx)); dp->dp_tmp_userrefs_obj = zap_create_link(mos, DMU_OT_USERREFS, DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_TMP_USERREFS, tx); } static int dsl_pool_user_hold_rele_impl(dsl_pool_t *dp, uint64_t dsobj, const char *tag, uint64_t now, dmu_tx_t *tx, boolean_t holding) { objset_t *mos = dp->dp_meta_objset; uint64_t zapobj = dp->dp_tmp_userrefs_obj; char *name; int error; ASSERT(spa_version(dp->dp_spa) >= SPA_VERSION_USERREFS); ASSERT(dmu_tx_is_syncing(tx)); /* * If the pool was created prior to SPA_VERSION_USERREFS, the * zap object for temporary holds might not exist yet. */ if (zapobj == 0) { if (holding) { dsl_pool_user_hold_create_obj(dp, tx); zapobj = dp->dp_tmp_userrefs_obj; } else { return (SET_ERROR(ENOENT)); } } name = kmem_asprintf("%llx-%s", (u_longlong_t)dsobj, tag); if (holding) error = zap_add(mos, zapobj, name, 8, 1, &now, tx); else error = zap_remove(mos, zapobj, name, tx); kmem_strfree(name); return (error); } /* * Add a temporary hold for the given dataset object and tag. */ int dsl_pool_user_hold(dsl_pool_t *dp, uint64_t dsobj, const char *tag, uint64_t now, dmu_tx_t *tx) { return (dsl_pool_user_hold_rele_impl(dp, dsobj, tag, now, tx, B_TRUE)); } /* * Release a temporary hold for the given dataset object and tag. */ int dsl_pool_user_release(dsl_pool_t *dp, uint64_t dsobj, const char *tag, dmu_tx_t *tx) { return (dsl_pool_user_hold_rele_impl(dp, dsobj, tag, 0, tx, B_FALSE)); } /* * DSL Pool Configuration Lock * * The dp_config_rwlock protects against changes to DSL state (e.g. dataset * creation / destruction / rename / property setting). It must be held for * read to hold a dataset or dsl_dir. I.e. you must call * dsl_pool_config_enter() or dsl_pool_hold() before calling * dsl_{dataset,dir}_hold{_obj}. In most circumstances, the dp_config_rwlock * must be held continuously until all datasets and dsl_dirs are released. * * The only exception to this rule is that if a "long hold" is placed on * a dataset, then the dp_config_rwlock may be dropped while the dataset * is still held. The long hold will prevent the dataset from being * destroyed -- the destroy will fail with EBUSY. A long hold can be * obtained by calling dsl_dataset_long_hold(), or by "owning" a dataset * (by calling dsl_{dataset,objset}_{try}own{_obj}). * * Legitimate long-holders (including owners) should be long-running, cancelable * tasks that should cause "zfs destroy" to fail. This includes DMU * consumers (i.e. a ZPL filesystem being mounted or ZVOL being open), * "zfs send", and "zfs diff". There are several other long-holders whose * uses are suboptimal (e.g. "zfs promote", and zil_suspend()). * * The usual formula for long-holding would be: * dsl_pool_hold() * dsl_dataset_hold() * ... perform checks ... * dsl_dataset_long_hold() * dsl_pool_rele() * ... perform long-running task ... * dsl_dataset_long_rele() * dsl_dataset_rele() * * Note that when the long hold is released, the dataset is still held but * the pool is not held. The dataset may change arbitrarily during this time * (e.g. it could be destroyed). Therefore you shouldn't do anything to the * dataset except release it. * * Operations generally fall somewhere into the following taxonomy: * * Read-Only Modifying * * Dataset Layer / MOS zfs get zfs destroy * * Individual Dataset read() write() * * * Dataset Layer Operations * * Modifying operations should generally use dsl_sync_task(). The synctask * infrastructure enforces proper locking strategy with respect to the * dp_config_rwlock. See the comment above dsl_sync_task() for details. * * Read-only operations will manually hold the pool, then the dataset, obtain * information from the dataset, then release the pool and dataset. * dmu_objset_{hold,rele}() are convenience routines that also do the pool * hold/rele. * * * Operations On Individual Datasets * * Objects _within_ an objset should only be modified by the current 'owner' * of the objset to prevent incorrect concurrent modification. Thus, use * {dmu_objset,dsl_dataset}_own to mark some entity as the current owner, * and fail with EBUSY if there is already an owner. The owner can then * implement its own locking strategy, independent of the dataset layer's * locking infrastructure. * (E.g., the ZPL has its own set of locks to control concurrency. A regular * vnop will not reach into the dataset layer). * * Ideally, objects would also only be read by the objset’s owner, so that we * don’t observe state mid-modification. * (E.g. the ZPL is creating a new object and linking it into a directory; if * you don’t coordinate with the ZPL to hold ZPL-level locks, you could see an * intermediate state. The ioctl level violates this but in pretty benign * ways, e.g. reading the zpl props object.) */ int dsl_pool_hold(const char *name, const void *tag, dsl_pool_t **dp) { spa_t *spa; int error; error = spa_open(name, &spa, tag); if (error == 0) { *dp = spa_get_dsl(spa); dsl_pool_config_enter(*dp, tag); } return (error); } void dsl_pool_rele(dsl_pool_t *dp, const void *tag) { dsl_pool_config_exit(dp, tag); spa_close(dp->dp_spa, tag); } void dsl_pool_config_enter(dsl_pool_t *dp, const void *tag) { /* * We use a "reentrant" reader-writer lock, but not reentrantly. * * The rrwlock can (with the track_all flag) track all reading threads, * which is very useful for debugging which code path failed to release * the lock, and for verifying that the *current* thread does hold * the lock. * * (Unlike a rwlock, which knows that N threads hold it for * read, but not *which* threads, so rw_held(RW_READER) returns TRUE * if any thread holds it for read, even if this thread doesn't). */ ASSERT(!rrw_held(&dp->dp_config_rwlock, RW_READER)); rrw_enter(&dp->dp_config_rwlock, RW_READER, tag); } void dsl_pool_config_enter_prio(dsl_pool_t *dp, const void *tag) { ASSERT(!rrw_held(&dp->dp_config_rwlock, RW_READER)); rrw_enter_read_prio(&dp->dp_config_rwlock, tag); } void dsl_pool_config_exit(dsl_pool_t *dp, const void *tag) { rrw_exit(&dp->dp_config_rwlock, tag); } boolean_t dsl_pool_config_held(dsl_pool_t *dp) { return (RRW_LOCK_HELD(&dp->dp_config_rwlock)); } boolean_t dsl_pool_config_held_writer(dsl_pool_t *dp) { return (RRW_WRITE_HELD(&dp->dp_config_rwlock)); } EXPORT_SYMBOL(dsl_pool_config_enter); EXPORT_SYMBOL(dsl_pool_config_exit); /* zfs_dirty_data_max_percent only applied at module load in arc_init(). */ ZFS_MODULE_PARAM(zfs, zfs_, dirty_data_max_percent, UINT, ZMOD_RD, "Max percent of RAM allowed to be dirty"); /* zfs_dirty_data_max_max_percent only applied at module load in arc_init(). */ ZFS_MODULE_PARAM(zfs, zfs_, dirty_data_max_max_percent, UINT, ZMOD_RD, "zfs_dirty_data_max upper bound as % of RAM"); ZFS_MODULE_PARAM(zfs, zfs_, delay_min_dirty_percent, UINT, ZMOD_RW, "Transaction delay threshold"); ZFS_MODULE_PARAM(zfs, zfs_, dirty_data_max, U64, ZMOD_RW, "Determines the dirty space limit"); ZFS_MODULE_PARAM(zfs, zfs_, wrlog_data_max, U64, ZMOD_RW, "The size limit of write-transaction zil log data"); /* zfs_dirty_data_max_max only applied at module load in arc_init(). */ ZFS_MODULE_PARAM(zfs, zfs_, dirty_data_max_max, U64, ZMOD_RD, "zfs_dirty_data_max upper bound in bytes"); ZFS_MODULE_PARAM(zfs, zfs_, dirty_data_sync_percent, UINT, ZMOD_RW, "Dirty data txg sync threshold as a percentage of zfs_dirty_data_max"); ZFS_MODULE_PARAM(zfs, zfs_, delay_scale, U64, ZMOD_RW, "How quickly delay approaches infinity"); ZFS_MODULE_PARAM(zfs, zfs_, sync_taskq_batch_pct, INT, ZMOD_RW, "Max percent of CPUs that are used to sync dirty data"); ZFS_MODULE_PARAM(zfs_zil, zfs_zil_, clean_taskq_nthr_pct, INT, ZMOD_RW, "Max percent of CPUs that are used per dp_sync_taskq"); ZFS_MODULE_PARAM(zfs_zil, zfs_zil_, clean_taskq_minalloc, INT, ZMOD_RW, "Number of taskq entries that are pre-populated"); ZFS_MODULE_PARAM(zfs_zil, zfs_zil_, clean_taskq_maxalloc, INT, ZMOD_RW, "Max number of taskq entries that are cached"); diff --git a/module/zfs/mmp.c b/module/zfs/mmp.c index 4f0273f3ed86..ef0e01df390f 100644 --- a/module/zfs/mmp.c +++ b/module/zfs/mmp.c @@ -1,747 +1,747 @@ /* * CDDL HEADER START * * The contents of this file are subject to the terms of the * Common Development and Distribution License (the "License"). * You may not use this file except in compliance with the License. * * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE * or https://opensource.org/licenses/CDDL-1.0. * See the License for the specific language governing permissions * and limitations under the License. * * When distributing Covered Code, include this CDDL HEADER in each * file and include the License file at usr/src/OPENSOLARIS.LICENSE. * If applicable, add the following below this CDDL HEADER, with the * fields enclosed by brackets "[]" replaced with your own identifying * information: Portions Copyright [yyyy] [name of copyright owner] * * CDDL HEADER END */ /* * Copyright (c) 2017 by Lawrence Livermore National Security, LLC. */ #include #include #include #include #include #include #include #include #include /* * Multi-Modifier Protection (MMP) attempts to prevent a user from importing * or opening a pool on more than one host at a time. In particular, it * prevents "zpool import -f" on a host from succeeding while the pool is * already imported on another host. There are many other ways in which a * device could be used by two hosts for different purposes at the same time * resulting in pool damage. This implementation does not attempt to detect * those cases. * * MMP operates by ensuring there are frequent visible changes on disk (a * "heartbeat") at all times. And by altering the import process to check * for these changes and failing the import when they are detected. This * functionality is enabled by setting the 'multihost' pool property to on. * * Uberblocks written by the txg_sync thread always go into the first * (N-MMP_BLOCKS_PER_LABEL) slots, the remaining slots are reserved for MMP. * They are used to hold uberblocks which are exactly the same as the last * synced uberblock except that the ub_timestamp and mmp_config are frequently * updated. Like all other uberblocks, the slot is written with an embedded * checksum, and slots with invalid checksums are ignored. This provides the * "heartbeat", with no risk of overwriting good uberblocks that must be * preserved, e.g. previous txgs and associated block pointers. * * Three optional fields are added to uberblock structure; ub_mmp_magic, * ub_mmp_config, and ub_mmp_delay. The ub_mmp_magic value allows zfs to tell * whether the other ub_mmp_* fields are valid. The ub_mmp_config field tells * the importing host the settings of zfs_multihost_interval and * zfs_multihost_fail_intervals on the host which last had (or currently has) * the pool imported. These determine how long a host must wait to detect * activity in the pool, before concluding the pool is not in use. The * mmp_delay field is a decaying average of the amount of time between * completion of successive MMP writes, in nanoseconds. It indicates whether * MMP is enabled. * * During import an activity test may now be performed to determine if * the pool is in use. The activity test is typically required if the * ZPOOL_CONFIG_HOSTID does not match the system hostid, the pool state is * POOL_STATE_ACTIVE, and the pool is not a root pool. * * The activity test finds the "best" uberblock (highest txg, timestamp, and, if * ub_mmp_magic is valid, sequence number from ub_mmp_config). It then waits * some time, and finds the "best" uberblock again. If any of the mentioned * fields have different values in the newly read uberblock, the pool is in use * by another host and the import fails. In order to assure the accuracy of the * activity test, the default values result in an activity test duration of 20x * the mmp write interval. * * The duration of the "zpool import" activity test depends on the information * available in the "best" uberblock: * * 1) If uberblock was written by zfs-0.8 or newer and fail_intervals > 0: * ub_mmp_config.fail_intervals * ub_mmp_config.multihost_interval * 2 * * In this case, a weak guarantee is provided. Since the host which last had * the pool imported will suspend the pool if no mmp writes land within * fail_intervals * multihost_interval ms, the absence of writes during that * time means either the pool is not imported, or it is imported but the pool * is suspended and no further writes will occur. * * Note that resuming the suspended pool on the remote host would invalidate * this guarantee, and so it is not allowed. * * The factor of 2 provides a conservative safety factor and derives from * MMP_IMPORT_SAFETY_FACTOR; * * 2) If uberblock was written by zfs-0.8 or newer and fail_intervals == 0: * (ub_mmp_config.multihost_interval + ub_mmp_delay) * * zfs_multihost_import_intervals * * In this case no guarantee can provided. However, as long as some devices * are healthy and connected, it is likely that at least one write will land * within (multihost_interval + mmp_delay) because multihost_interval is * enough time for a write to be attempted to each leaf vdev, and mmp_delay * is enough for one to land, based on past delays. Multiplying by * zfs_multihost_import_intervals provides a conservative safety factor. * * 3) If uberblock was written by zfs-0.7: * (zfs_multihost_interval + ub_mmp_delay) * zfs_multihost_import_intervals * * The same logic as case #2 applies, but we do not know remote tunables. * * We use the local value for zfs_multihost_interval because the original MMP * did not record this value in the uberblock. * * ub_mmp_delay >= (zfs_multihost_interval / leaves), so if the other host * has a much larger zfs_multihost_interval set, ub_mmp_delay will reflect * that. We will have waited enough time for zfs_multihost_import_intervals * writes to be issued and all but one to land. * * single device pool example delays * * import_delay = (1 + 1) * 20 = 40s #defaults, no I/O delay * import_delay = (1 + 10) * 20 = 220s #defaults, 10s I/O delay * import_delay = (10 + 10) * 20 = 400s #10s multihost_interval, * no I/O delay * 100 device pool example delays * * import_delay = (1 + .01) * 20 = 20s #defaults, no I/O delay * import_delay = (1 + 10) * 20 = 220s #defaults, 10s I/O delay * import_delay = (10 + .1) * 20 = 202s #10s multihost_interval, * no I/O delay * * 4) Otherwise, this uberblock was written by a pre-MMP zfs: * zfs_multihost_import_intervals * zfs_multihost_interval * * In this case local tunables are used. By default this product = 10s, long * enough for a pool with any activity at all to write at least one * uberblock. No guarantee can be provided. * * Additionally, the duration is then extended by a random 25% to attempt to to * detect simultaneous imports. For example, if both partner hosts are rebooted * at the same time and automatically attempt to import the pool. */ /* * Used to control the frequency of mmp writes which are performed when the * 'multihost' pool property is on. This is one factor used to determine the * length of the activity check during import. * * On average an mmp write will be issued for each leaf vdev every * zfs_multihost_interval milliseconds. In practice, the observed period can * vary with the I/O load and this observed value is the ub_mmp_delay which is * stored in the uberblock. The minimum allowed value is 100 ms. */ uint64_t zfs_multihost_interval = MMP_DEFAULT_INTERVAL; /* * Used to control the duration of the activity test on import. Smaller values * of zfs_multihost_import_intervals will reduce the import time but increase * the risk of failing to detect an active pool. The total activity check time * is never allowed to drop below one second. A value of 0 is ignored and * treated as if it was set to 1. */ uint_t zfs_multihost_import_intervals = MMP_DEFAULT_IMPORT_INTERVALS; /* * Controls the behavior of the pool when mmp write failures or delays are * detected. * * When zfs_multihost_fail_intervals = 0, mmp write failures or delays are * ignored. The failures will still be reported to the ZED which depending on * its configuration may take action such as suspending the pool or taking a * device offline. * * When zfs_multihost_fail_intervals > 0, the pool will be suspended if * zfs_multihost_fail_intervals * zfs_multihost_interval milliseconds pass * without a successful mmp write. This guarantees the activity test will see * mmp writes if the pool is imported. A value of 1 is ignored and treated as * if it was set to 2, because a single leaf vdev pool will issue a write once * per multihost_interval and thus any variation in latency would cause the * pool to be suspended. */ uint_t zfs_multihost_fail_intervals = MMP_DEFAULT_FAIL_INTERVALS; static const void *const mmp_tag = "mmp_write_uberblock"; static __attribute__((noreturn)) void mmp_thread(void *arg); void mmp_init(spa_t *spa) { mmp_thread_t *mmp = &spa->spa_mmp; mutex_init(&mmp->mmp_thread_lock, NULL, MUTEX_DEFAULT, NULL); cv_init(&mmp->mmp_thread_cv, NULL, CV_DEFAULT, NULL); mutex_init(&mmp->mmp_io_lock, NULL, MUTEX_DEFAULT, NULL); mmp->mmp_kstat_id = 1; } void mmp_fini(spa_t *spa) { mmp_thread_t *mmp = &spa->spa_mmp; mutex_destroy(&mmp->mmp_thread_lock); cv_destroy(&mmp->mmp_thread_cv); mutex_destroy(&mmp->mmp_io_lock); } static void mmp_thread_enter(mmp_thread_t *mmp, callb_cpr_t *cpr) { CALLB_CPR_INIT(cpr, &mmp->mmp_thread_lock, callb_generic_cpr, FTAG); mutex_enter(&mmp->mmp_thread_lock); } static void mmp_thread_exit(mmp_thread_t *mmp, kthread_t **mpp, callb_cpr_t *cpr) { ASSERT(*mpp != NULL); *mpp = NULL; cv_broadcast(&mmp->mmp_thread_cv); CALLB_CPR_EXIT(cpr); /* drops &mmp->mmp_thread_lock */ } void mmp_thread_start(spa_t *spa) { mmp_thread_t *mmp = &spa->spa_mmp; if (spa_writeable(spa)) { mutex_enter(&mmp->mmp_thread_lock); if (!mmp->mmp_thread) { mmp->mmp_thread = thread_create(NULL, 0, mmp_thread, spa, 0, &p0, TS_RUN, defclsyspri); zfs_dbgmsg("MMP thread started pool '%s' " "gethrtime %llu", spa_name(spa), gethrtime()); } mutex_exit(&mmp->mmp_thread_lock); } } void mmp_thread_stop(spa_t *spa) { mmp_thread_t *mmp = &spa->spa_mmp; mutex_enter(&mmp->mmp_thread_lock); mmp->mmp_thread_exiting = 1; cv_broadcast(&mmp->mmp_thread_cv); while (mmp->mmp_thread) { cv_wait(&mmp->mmp_thread_cv, &mmp->mmp_thread_lock); } mutex_exit(&mmp->mmp_thread_lock); zfs_dbgmsg("MMP thread stopped pool '%s' gethrtime %llu", spa_name(spa), gethrtime()); ASSERT(mmp->mmp_thread == NULL); mmp->mmp_thread_exiting = 0; } typedef enum mmp_vdev_state_flag { MMP_FAIL_NOT_WRITABLE = (1 << 0), MMP_FAIL_WRITE_PENDING = (1 << 1), } mmp_vdev_state_flag_t; /* * Find a leaf vdev to write an MMP block to. It must not have an outstanding * mmp write (if so a new write will also likely block). If there is no usable * leaf, a nonzero error value is returned. The error value returned is a bit * field. * * MMP_FAIL_WRITE_PENDING One or more leaf vdevs are writeable, but have an * outstanding MMP write. * MMP_FAIL_NOT_WRITABLE One or more leaf vdevs are not writeable. */ static int mmp_next_leaf(spa_t *spa) { vdev_t *leaf; vdev_t *starting_leaf; int fail_mask = 0; ASSERT(MUTEX_HELD(&spa->spa_mmp.mmp_io_lock)); ASSERT(spa_config_held(spa, SCL_STATE, RW_READER)); ASSERT(list_link_active(&spa->spa_leaf_list.list_head) == B_TRUE); ASSERT(!list_is_empty(&spa->spa_leaf_list)); if (spa->spa_mmp.mmp_leaf_last_gen != spa->spa_leaf_list_gen) { spa->spa_mmp.mmp_last_leaf = list_head(&spa->spa_leaf_list); spa->spa_mmp.mmp_leaf_last_gen = spa->spa_leaf_list_gen; } leaf = spa->spa_mmp.mmp_last_leaf; if (leaf == NULL) leaf = list_head(&spa->spa_leaf_list); starting_leaf = leaf; do { leaf = list_next(&spa->spa_leaf_list, leaf); if (leaf == NULL) { leaf = list_head(&spa->spa_leaf_list); ASSERT3P(leaf, !=, NULL); } /* * We skip unwritable, offline, detached, and dRAID spare * devices as they are either not legal targets or the write * may fail or not be seen by other hosts. Skipped dRAID * spares can never be written so the fail mask is not set. */ if (!vdev_writeable(leaf) || leaf->vdev_offline || leaf->vdev_detached) { fail_mask |= MMP_FAIL_NOT_WRITABLE; } else if (leaf->vdev_ops == &vdev_draid_spare_ops) { continue; } else if (leaf->vdev_mmp_pending != 0) { fail_mask |= MMP_FAIL_WRITE_PENDING; } else { spa->spa_mmp.mmp_last_leaf = leaf; return (0); } } while (leaf != starting_leaf); ASSERT(fail_mask); return (fail_mask); } /* * MMP writes are issued on a fixed schedule, but may complete at variable, * much longer, intervals. The mmp_delay captures long periods between * successful writes for any reason, including disk latency, scheduling delays, * etc. * * The mmp_delay is usually calculated as a decaying average, but if the latest * delay is higher we do not average it, so that we do not hide sudden spikes * which the importing host must wait for. * * If writes are occurring frequently, such as due to a high rate of txg syncs, * the mmp_delay could become very small. Since those short delays depend on * activity we cannot count on, we never allow mmp_delay to get lower than rate * expected if only mmp_thread writes occur. * * If an mmp write was skipped or fails, and we have already waited longer than * mmp_delay, we need to update it so the next write reflects the longer delay. * * Do not set mmp_delay if the multihost property is not on, so as not to * trigger an activity check on import. */ static void mmp_delay_update(spa_t *spa, boolean_t write_completed) { mmp_thread_t *mts = &spa->spa_mmp; hrtime_t delay = gethrtime() - mts->mmp_last_write; ASSERT(MUTEX_HELD(&mts->mmp_io_lock)); if (spa_multihost(spa) == B_FALSE) { mts->mmp_delay = 0; return; } if (delay > mts->mmp_delay) mts->mmp_delay = delay; if (write_completed == B_FALSE) return; mts->mmp_last_write = gethrtime(); /* * strictly less than, in case delay was changed above. */ if (delay < mts->mmp_delay) { hrtime_t min_delay = MSEC2NSEC(MMP_INTERVAL_OK(zfs_multihost_interval)) / MAX(1, vdev_count_leaves(spa)); mts->mmp_delay = MAX(((delay + mts->mmp_delay * 127) / 128), min_delay); } } static void mmp_write_done(zio_t *zio) { spa_t *spa = zio->io_spa; vdev_t *vd = zio->io_vd; mmp_thread_t *mts = zio->io_private; mutex_enter(&mts->mmp_io_lock); uint64_t mmp_kstat_id = vd->vdev_mmp_kstat_id; hrtime_t mmp_write_duration = gethrtime() - vd->vdev_mmp_pending; mmp_delay_update(spa, (zio->io_error == 0)); vd->vdev_mmp_pending = 0; vd->vdev_mmp_kstat_id = 0; mutex_exit(&mts->mmp_io_lock); spa_config_exit(spa, SCL_STATE, mmp_tag); spa_mmp_history_set(spa, mmp_kstat_id, zio->io_error, mmp_write_duration); abd_free(zio->io_abd); } /* * When the uberblock on-disk is updated by a spa_sync, * creating a new "best" uberblock, update the one stored * in the mmp thread state, used for mmp writes. */ void mmp_update_uberblock(spa_t *spa, uberblock_t *ub) { mmp_thread_t *mmp = &spa->spa_mmp; mutex_enter(&mmp->mmp_io_lock); mmp->mmp_ub = *ub; mmp->mmp_seq = 1; mmp->mmp_ub.ub_timestamp = gethrestime_sec(); mmp_delay_update(spa, B_TRUE); mutex_exit(&mmp->mmp_io_lock); } /* * Choose a random vdev, label, and MMP block, and write over it * with a copy of the last-synced uberblock, whose timestamp * has been updated to reflect that the pool is in use. */ static void mmp_write_uberblock(spa_t *spa) { int flags = ZIO_FLAG_CONFIG_WRITER | ZIO_FLAG_CANFAIL; mmp_thread_t *mmp = &spa->spa_mmp; uberblock_t *ub; vdev_t *vd = NULL; int label, error; uint64_t offset; hrtime_t lock_acquire_time = gethrtime(); spa_config_enter(spa, SCL_STATE, mmp_tag, RW_READER); lock_acquire_time = gethrtime() - lock_acquire_time; if (lock_acquire_time > (MSEC2NSEC(MMP_MIN_INTERVAL) / 10)) zfs_dbgmsg("MMP SCL_STATE acquisition pool '%s' took %llu ns " "gethrtime %llu", spa_name(spa), lock_acquire_time, gethrtime()); mutex_enter(&mmp->mmp_io_lock); error = mmp_next_leaf(spa); /* * spa_mmp_history has two types of entries: * Issued MMP write: records time issued, error status, etc. * Skipped MMP write: an MMP write could not be issued because no * suitable leaf vdev was available. See comment above struct * spa_mmp_history for details. */ if (error) { mmp_delay_update(spa, B_FALSE); if (mmp->mmp_skip_error == error) { spa_mmp_history_set_skip(spa, mmp->mmp_kstat_id - 1); } else { mmp->mmp_skip_error = error; spa_mmp_history_add(spa, mmp->mmp_ub.ub_txg, gethrestime_sec(), mmp->mmp_delay, NULL, 0, mmp->mmp_kstat_id++, error); zfs_dbgmsg("MMP error choosing leaf pool '%s' " "gethrtime %llu fail_mask %#x", spa_name(spa), gethrtime(), error); } mutex_exit(&mmp->mmp_io_lock); spa_config_exit(spa, SCL_STATE, mmp_tag); return; } vd = spa->spa_mmp.mmp_last_leaf; if (mmp->mmp_skip_error != 0) { mmp->mmp_skip_error = 0; zfs_dbgmsg("MMP write after skipping due to unavailable " "leaves, pool '%s' gethrtime %llu leaf %llu", spa_name(spa), (u_longlong_t)gethrtime(), (u_longlong_t)vd->vdev_guid); } if (mmp->mmp_zio_root == NULL) mmp->mmp_zio_root = zio_root(spa, NULL, NULL, flags | ZIO_FLAG_GODFATHER); if (mmp->mmp_ub.ub_timestamp != gethrestime_sec()) { /* * Want to reset mmp_seq when timestamp advances because after * an mmp_seq wrap new values will not be chosen by * uberblock_compare() as the "best". */ mmp->mmp_ub.ub_timestamp = gethrestime_sec(); mmp->mmp_seq = 1; } ub = &mmp->mmp_ub; ub->ub_mmp_magic = MMP_MAGIC; ub->ub_mmp_delay = mmp->mmp_delay; ub->ub_mmp_config = MMP_SEQ_SET(mmp->mmp_seq) | MMP_INTERVAL_SET(MMP_INTERVAL_OK(zfs_multihost_interval)) | MMP_FAIL_INT_SET(MMP_FAIL_INTVS_OK( zfs_multihost_fail_intervals)); vd->vdev_mmp_pending = gethrtime(); vd->vdev_mmp_kstat_id = mmp->mmp_kstat_id; zio_t *zio = zio_null(mmp->mmp_zio_root, spa, NULL, NULL, NULL, flags); abd_t *ub_abd = abd_alloc_for_io(VDEV_UBERBLOCK_SIZE(vd), B_TRUE); abd_zero(ub_abd, VDEV_UBERBLOCK_SIZE(vd)); abd_copy_from_buf(ub_abd, ub, sizeof (uberblock_t)); mmp->mmp_seq++; mmp->mmp_kstat_id++; mutex_exit(&mmp->mmp_io_lock); offset = VDEV_UBERBLOCK_OFFSET(vd, VDEV_UBERBLOCK_COUNT(vd) - MMP_BLOCKS_PER_LABEL + random_in_range(MMP_BLOCKS_PER_LABEL)); label = random_in_range(VDEV_LABELS); vdev_label_write(zio, vd, label, ub_abd, offset, VDEV_UBERBLOCK_SIZE(vd), mmp_write_done, mmp, flags | ZIO_FLAG_DONT_PROPAGATE); (void) spa_mmp_history_add(spa, ub->ub_txg, ub->ub_timestamp, ub->ub_mmp_delay, vd, label, vd->vdev_mmp_kstat_id, 0); zio_nowait(zio); } static __attribute__((noreturn)) void mmp_thread(void *arg) { spa_t *spa = (spa_t *)arg; mmp_thread_t *mmp = &spa->spa_mmp; boolean_t suspended = spa_suspended(spa); boolean_t multihost = spa_multihost(spa); uint64_t mmp_interval = MSEC2NSEC(MMP_INTERVAL_OK( zfs_multihost_interval)); uint32_t mmp_fail_intervals = MMP_FAIL_INTVS_OK( zfs_multihost_fail_intervals); hrtime_t mmp_fail_ns = mmp_fail_intervals * mmp_interval; - boolean_t last_spa_suspended = suspended; - boolean_t last_spa_multihost = multihost; - uint64_t last_mmp_interval = mmp_interval; - uint32_t last_mmp_fail_intervals = mmp_fail_intervals; - hrtime_t last_mmp_fail_ns = mmp_fail_ns; + boolean_t last_spa_suspended; + boolean_t last_spa_multihost; + uint64_t last_mmp_interval; + uint32_t last_mmp_fail_intervals; + hrtime_t last_mmp_fail_ns; callb_cpr_t cpr; int skip_wait = 0; mmp_thread_enter(mmp, &cpr); /* * There have been no MMP writes yet. Setting mmp_last_write here gives * us one mmp_fail_ns period, which is consistent with the activity * check duration, to try to land an MMP write before MMP suspends the * pool (if so configured). */ mutex_enter(&mmp->mmp_io_lock); mmp->mmp_last_write = gethrtime(); mmp->mmp_delay = MSEC2NSEC(MMP_INTERVAL_OK(zfs_multihost_interval)); mutex_exit(&mmp->mmp_io_lock); while (!mmp->mmp_thread_exiting) { hrtime_t next_time = gethrtime() + MSEC2NSEC(MMP_DEFAULT_INTERVAL); int leaves = MAX(vdev_count_leaves(spa), 1); /* Detect changes in tunables or state */ last_spa_suspended = suspended; last_spa_multihost = multihost; suspended = spa_suspended(spa); multihost = spa_multihost(spa); last_mmp_interval = mmp_interval; last_mmp_fail_intervals = mmp_fail_intervals; last_mmp_fail_ns = mmp_fail_ns; mmp_interval = MSEC2NSEC(MMP_INTERVAL_OK( zfs_multihost_interval)); mmp_fail_intervals = MMP_FAIL_INTVS_OK( zfs_multihost_fail_intervals); /* Smooth so pool is not suspended when reducing tunables */ if (mmp_fail_intervals * mmp_interval < mmp_fail_ns) { mmp_fail_ns = (mmp_fail_ns * 31 + mmp_fail_intervals * mmp_interval) / 32; } else { mmp_fail_ns = mmp_fail_intervals * mmp_interval; } if (mmp_interval != last_mmp_interval || mmp_fail_intervals != last_mmp_fail_intervals) { /* * We want other hosts to see new tunables as quickly as * possible. Write out at higher frequency than usual. */ skip_wait += leaves; } if (multihost) next_time = gethrtime() + mmp_interval / leaves; if (mmp_fail_ns != last_mmp_fail_ns) { zfs_dbgmsg("MMP interval change pool '%s' " "gethrtime %llu last_mmp_interval %llu " "mmp_interval %llu last_mmp_fail_intervals %u " "mmp_fail_intervals %u mmp_fail_ns %llu " "skip_wait %d leaves %d next_time %llu", spa_name(spa), (u_longlong_t)gethrtime(), (u_longlong_t)last_mmp_interval, (u_longlong_t)mmp_interval, last_mmp_fail_intervals, mmp_fail_intervals, (u_longlong_t)mmp_fail_ns, skip_wait, leaves, (u_longlong_t)next_time); } /* * MMP off => on, or suspended => !suspended: * No writes occurred recently. Update mmp_last_write to give * us some time to try. */ if ((!last_spa_multihost && multihost) || (last_spa_suspended && !suspended)) { zfs_dbgmsg("MMP state change pool '%s': gethrtime %llu " "last_spa_multihost %u multihost %u " "last_spa_suspended %u suspended %u", spa_name(spa), (u_longlong_t)gethrtime(), last_spa_multihost, multihost, last_spa_suspended, suspended); mutex_enter(&mmp->mmp_io_lock); mmp->mmp_last_write = gethrtime(); mmp->mmp_delay = mmp_interval; mutex_exit(&mmp->mmp_io_lock); } /* * MMP on => off: * mmp_delay == 0 tells importing node to skip activity check. */ if (last_spa_multihost && !multihost) { mutex_enter(&mmp->mmp_io_lock); mmp->mmp_delay = 0; mutex_exit(&mmp->mmp_io_lock); } /* * Suspend the pool if no MMP write has succeeded in over * mmp_interval * mmp_fail_intervals nanoseconds. */ if (multihost && !suspended && mmp_fail_intervals && (gethrtime() - mmp->mmp_last_write) > mmp_fail_ns) { zfs_dbgmsg("MMP suspending pool '%s': gethrtime %llu " "mmp_last_write %llu mmp_interval %llu " "mmp_fail_intervals %llu mmp_fail_ns %llu", spa_name(spa), (u_longlong_t)gethrtime(), (u_longlong_t)mmp->mmp_last_write, (u_longlong_t)mmp_interval, (u_longlong_t)mmp_fail_intervals, (u_longlong_t)mmp_fail_ns); cmn_err(CE_WARN, "MMP writes to pool '%s' have not " "succeeded in over %llu ms; suspending pool. " "Hrtime %llu", spa_name(spa), NSEC2MSEC(gethrtime() - mmp->mmp_last_write), gethrtime()); zio_suspend(spa, NULL, ZIO_SUSPEND_MMP); } if (multihost && !suspended) mmp_write_uberblock(spa); if (skip_wait > 0) { next_time = gethrtime() + MSEC2NSEC(MMP_MIN_INTERVAL) / leaves; skip_wait--; } CALLB_CPR_SAFE_BEGIN(&cpr); (void) cv_timedwait_idle_hires(&mmp->mmp_thread_cv, &mmp->mmp_thread_lock, next_time, USEC2NSEC(100), CALLOUT_FLAG_ABSOLUTE); CALLB_CPR_SAFE_END(&cpr, &mmp->mmp_thread_lock); } /* Outstanding writes are allowed to complete. */ zio_wait(mmp->mmp_zio_root); mmp->mmp_zio_root = NULL; mmp_thread_exit(mmp, &mmp->mmp_thread, &cpr); thread_exit(); } /* * Signal the MMP thread to wake it, when it is sleeping on * its cv. Used when some module parameter has changed and * we want the thread to know about it. * Only signal if the pool is active and mmp thread is * running, otherwise there is no thread to wake. */ static void mmp_signal_thread(spa_t *spa) { mmp_thread_t *mmp = &spa->spa_mmp; mutex_enter(&mmp->mmp_thread_lock); if (mmp->mmp_thread) cv_broadcast(&mmp->mmp_thread_cv); mutex_exit(&mmp->mmp_thread_lock); } void mmp_signal_all_threads(void) { spa_t *spa = NULL; mutex_enter(&spa_namespace_lock); while ((spa = spa_next(spa))) { if (spa->spa_state == POOL_STATE_ACTIVE) mmp_signal_thread(spa); } mutex_exit(&spa_namespace_lock); } /* BEGIN CSTYLED */ ZFS_MODULE_PARAM_CALL(zfs_multihost, zfs_multihost_, interval, param_set_multihost_interval, spl_param_get_u64, ZMOD_RW, "Milliseconds between mmp writes to each leaf"); /* END CSTYLED */ ZFS_MODULE_PARAM(zfs_multihost, zfs_multihost_, fail_intervals, UINT, ZMOD_RW, "Max allowed period without a successful mmp write"); ZFS_MODULE_PARAM(zfs_multihost, zfs_multihost_, import_intervals, UINT, ZMOD_RW, "Number of zfs_multihost_interval periods to wait for activity"); diff --git a/module/zfs/spa.c b/module/zfs/spa.c index 26df0290c9ff..0a9f31a8fc85 100644 --- a/module/zfs/spa.c +++ b/module/zfs/spa.c @@ -1,10043 +1,10043 @@ /* * CDDL HEADER START * * The contents of this file are subject to the terms of the * Common Development and Distribution License (the "License"). * You may not use this file except in compliance with the License. * * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE * or https://opensource.org/licenses/CDDL-1.0. * See the License for the specific language governing permissions * and limitations under the License. * * When distributing Covered Code, include this CDDL HEADER in each * file and include the License file at usr/src/OPENSOLARIS.LICENSE. * If applicable, add the following below this CDDL HEADER, with the * fields enclosed by brackets "[]" replaced with your own identifying * information: Portions Copyright [yyyy] [name of copyright owner] * * CDDL HEADER END */ /* * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. * Copyright (c) 2011, 2020 by Delphix. All rights reserved. * Copyright (c) 2018, Nexenta Systems, Inc. All rights reserved. * Copyright (c) 2014 Spectra Logic Corporation, All rights reserved. * Copyright 2013 Saso Kiselkov. All rights reserved. * Copyright (c) 2014 Integros [integros.com] * Copyright 2016 Toomas Soome * Copyright (c) 2016 Actifio, Inc. All rights reserved. * Copyright 2018 Joyent, Inc. * Copyright (c) 2017, 2019, Datto Inc. All rights reserved. * Copyright 2017 Joyent, Inc. * Copyright (c) 2017, Intel Corporation. * Copyright (c) 2021, Colm Buckley */ /* * SPA: Storage Pool Allocator * * This file contains all the routines used when modifying on-disk SPA state. * This includes opening, importing, destroying, exporting a pool, and syncing a * pool. */ #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #ifdef _KERNEL #include #include #include #include #include #endif /* _KERNEL */ #include "zfs_prop.h" #include "zfs_comutil.h" /* * The interval, in seconds, at which failed configuration cache file writes * should be retried. */ int zfs_ccw_retry_interval = 300; typedef enum zti_modes { ZTI_MODE_FIXED, /* value is # of threads (min 1) */ ZTI_MODE_BATCH, /* cpu-intensive; value is ignored */ ZTI_MODE_SCALE, /* Taskqs scale with CPUs. */ ZTI_MODE_NULL, /* don't create a taskq */ ZTI_NMODES } zti_modes_t; #define ZTI_P(n, q) { ZTI_MODE_FIXED, (n), (q) } #define ZTI_PCT(n) { ZTI_MODE_ONLINE_PERCENT, (n), 1 } #define ZTI_BATCH { ZTI_MODE_BATCH, 0, 1 } #define ZTI_SCALE { ZTI_MODE_SCALE, 0, 1 } #define ZTI_NULL { ZTI_MODE_NULL, 0, 0 } #define ZTI_N(n) ZTI_P(n, 1) #define ZTI_ONE ZTI_N(1) typedef struct zio_taskq_info { zti_modes_t zti_mode; uint_t zti_value; uint_t zti_count; } zio_taskq_info_t; static const char *const zio_taskq_types[ZIO_TASKQ_TYPES] = { "iss", "iss_h", "int", "int_h" }; /* * This table defines the taskq settings for each ZFS I/O type. When * initializing a pool, we use this table to create an appropriately sized * taskq. Some operations are low volume and therefore have a small, static * number of threads assigned to their taskqs using the ZTI_N(#) or ZTI_ONE * macros. Other operations process a large amount of data; the ZTI_BATCH * macro causes us to create a taskq oriented for throughput. Some operations * are so high frequency and short-lived that the taskq itself can become a * point of lock contention. The ZTI_P(#, #) macro indicates that we need an * additional degree of parallelism specified by the number of threads per- * taskq and the number of taskqs; when dispatching an event in this case, the * particular taskq is chosen at random. ZTI_SCALE is similar to ZTI_BATCH, * but with number of taskqs also scaling with number of CPUs. * * The different taskq priorities are to handle the different contexts (issue * and interrupt) and then to reserve threads for ZIO_PRIORITY_NOW I/Os that * need to be handled with minimum delay. */ static const zio_taskq_info_t zio_taskqs[ZIO_TYPES][ZIO_TASKQ_TYPES] = { /* ISSUE ISSUE_HIGH INTR INTR_HIGH */ { ZTI_ONE, ZTI_NULL, ZTI_ONE, ZTI_NULL }, /* NULL */ { ZTI_N(8), ZTI_NULL, ZTI_SCALE, ZTI_NULL }, /* READ */ { ZTI_BATCH, ZTI_N(5), ZTI_SCALE, ZTI_N(5) }, /* WRITE */ { ZTI_SCALE, ZTI_NULL, ZTI_ONE, ZTI_NULL }, /* FREE */ { ZTI_ONE, ZTI_NULL, ZTI_ONE, ZTI_NULL }, /* CLAIM */ { ZTI_ONE, ZTI_NULL, ZTI_ONE, ZTI_NULL }, /* IOCTL */ { ZTI_N(4), ZTI_NULL, ZTI_ONE, ZTI_NULL }, /* TRIM */ }; static void spa_sync_version(void *arg, dmu_tx_t *tx); static void spa_sync_props(void *arg, dmu_tx_t *tx); static boolean_t spa_has_active_shared_spare(spa_t *spa); static int spa_load_impl(spa_t *spa, spa_import_type_t type, const char **ereport); static void spa_vdev_resilver_done(spa_t *spa); static uint_t zio_taskq_batch_pct = 80; /* 1 thread per cpu in pset */ static uint_t zio_taskq_batch_tpq; /* threads per taskq */ static const boolean_t zio_taskq_sysdc = B_TRUE; /* use SDC scheduling class */ static const uint_t zio_taskq_basedc = 80; /* base duty cycle */ static const boolean_t spa_create_process = B_TRUE; /* no process => no sysdc */ /* * Report any spa_load_verify errors found, but do not fail spa_load. * This is used by zdb to analyze non-idle pools. */ boolean_t spa_load_verify_dryrun = B_FALSE; /* * Allow read spacemaps in case of readonly import (spa_mode == SPA_MODE_READ). * This is used by zdb for spacemaps verification. */ boolean_t spa_mode_readable_spacemaps = B_FALSE; /* * This (illegal) pool name is used when temporarily importing a spa_t in order * to get the vdev stats associated with the imported devices. */ #define TRYIMPORT_NAME "$import" /* * For debugging purposes: print out vdev tree during pool import. */ static int spa_load_print_vdev_tree = B_FALSE; /* * A non-zero value for zfs_max_missing_tvds means that we allow importing * pools with missing top-level vdevs. This is strictly intended for advanced * pool recovery cases since missing data is almost inevitable. Pools with * missing devices can only be imported read-only for safety reasons, and their * fail-mode will be automatically set to "continue". * * With 1 missing vdev we should be able to import the pool and mount all * datasets. User data that was not modified after the missing device has been * added should be recoverable. This means that snapshots created prior to the * addition of that device should be completely intact. * * With 2 missing vdevs, some datasets may fail to mount since there are * dataset statistics that are stored as regular metadata. Some data might be * recoverable if those vdevs were added recently. * * With 3 or more missing vdevs, the pool is severely damaged and MOS entries * may be missing entirely. Chances of data recovery are very low. Note that * there are also risks of performing an inadvertent rewind as we might be * missing all the vdevs with the latest uberblocks. */ uint64_t zfs_max_missing_tvds = 0; /* * The parameters below are similar to zfs_max_missing_tvds but are only * intended for a preliminary open of the pool with an untrusted config which * might be incomplete or out-dated. * * We are more tolerant for pools opened from a cachefile since we could have * an out-dated cachefile where a device removal was not registered. * We could have set the limit arbitrarily high but in the case where devices * are really missing we would want to return the proper error codes; we chose * SPA_DVAS_PER_BP - 1 so that some copies of the MOS would still be available * and we get a chance to retrieve the trusted config. */ uint64_t zfs_max_missing_tvds_cachefile = SPA_DVAS_PER_BP - 1; /* * In the case where config was assembled by scanning device paths (/dev/dsks * by default) we are less tolerant since all the existing devices should have * been detected and we want spa_load to return the right error codes. */ uint64_t zfs_max_missing_tvds_scan = 0; /* * Debugging aid that pauses spa_sync() towards the end. */ static const boolean_t zfs_pause_spa_sync = B_FALSE; /* * Variables to indicate the livelist condense zthr func should wait at certain * points for the livelist to be removed - used to test condense/destroy races */ static int zfs_livelist_condense_zthr_pause = 0; static int zfs_livelist_condense_sync_pause = 0; /* * Variables to track whether or not condense cancellation has been * triggered in testing. */ static int zfs_livelist_condense_sync_cancel = 0; static int zfs_livelist_condense_zthr_cancel = 0; /* * Variable to track whether or not extra ALLOC blkptrs were added to a * livelist entry while it was being condensed (caused by the way we track * remapped blkptrs in dbuf_remap_impl) */ static int zfs_livelist_condense_new_alloc = 0; /* * ========================================================================== * SPA properties routines * ========================================================================== */ /* * Add a (source=src, propname=propval) list to an nvlist. */ static void spa_prop_add_list(nvlist_t *nvl, zpool_prop_t prop, const char *strval, uint64_t intval, zprop_source_t src) { const char *propname = zpool_prop_to_name(prop); nvlist_t *propval; propval = fnvlist_alloc(); fnvlist_add_uint64(propval, ZPROP_SOURCE, src); if (strval != NULL) fnvlist_add_string(propval, ZPROP_VALUE, strval); else fnvlist_add_uint64(propval, ZPROP_VALUE, intval); fnvlist_add_nvlist(nvl, propname, propval); nvlist_free(propval); } /* * Get property values from the spa configuration. */ static void spa_prop_get_config(spa_t *spa, nvlist_t **nvp) { vdev_t *rvd = spa->spa_root_vdev; dsl_pool_t *pool = spa->spa_dsl_pool; uint64_t size, alloc, cap, version; const zprop_source_t src = ZPROP_SRC_NONE; spa_config_dirent_t *dp; metaslab_class_t *mc = spa_normal_class(spa); ASSERT(MUTEX_HELD(&spa->spa_props_lock)); if (rvd != NULL) { alloc = metaslab_class_get_alloc(mc); alloc += metaslab_class_get_alloc(spa_special_class(spa)); alloc += metaslab_class_get_alloc(spa_dedup_class(spa)); alloc += metaslab_class_get_alloc(spa_embedded_log_class(spa)); size = metaslab_class_get_space(mc); size += metaslab_class_get_space(spa_special_class(spa)); size += metaslab_class_get_space(spa_dedup_class(spa)); size += metaslab_class_get_space(spa_embedded_log_class(spa)); spa_prop_add_list(*nvp, ZPOOL_PROP_NAME, spa_name(spa), 0, src); spa_prop_add_list(*nvp, ZPOOL_PROP_SIZE, NULL, size, src); spa_prop_add_list(*nvp, ZPOOL_PROP_ALLOCATED, NULL, alloc, src); spa_prop_add_list(*nvp, ZPOOL_PROP_FREE, NULL, size - alloc, src); spa_prop_add_list(*nvp, ZPOOL_PROP_CHECKPOINT, NULL, spa->spa_checkpoint_info.sci_dspace, src); spa_prop_add_list(*nvp, ZPOOL_PROP_FRAGMENTATION, NULL, metaslab_class_fragmentation(mc), src); spa_prop_add_list(*nvp, ZPOOL_PROP_EXPANDSZ, NULL, metaslab_class_expandable_space(mc), src); spa_prop_add_list(*nvp, ZPOOL_PROP_READONLY, NULL, (spa_mode(spa) == SPA_MODE_READ), src); cap = (size == 0) ? 0 : (alloc * 100 / size); spa_prop_add_list(*nvp, ZPOOL_PROP_CAPACITY, NULL, cap, src); spa_prop_add_list(*nvp, ZPOOL_PROP_DEDUPRATIO, NULL, ddt_get_pool_dedup_ratio(spa), src); spa_prop_add_list(*nvp, ZPOOL_PROP_HEALTH, NULL, rvd->vdev_state, src); version = spa_version(spa); if (version == zpool_prop_default_numeric(ZPOOL_PROP_VERSION)) { spa_prop_add_list(*nvp, ZPOOL_PROP_VERSION, NULL, version, ZPROP_SRC_DEFAULT); } else { spa_prop_add_list(*nvp, ZPOOL_PROP_VERSION, NULL, version, ZPROP_SRC_LOCAL); } spa_prop_add_list(*nvp, ZPOOL_PROP_LOAD_GUID, NULL, spa_load_guid(spa), src); } if (pool != NULL) { /* * The $FREE directory was introduced in SPA_VERSION_DEADLISTS, * when opening pools before this version freedir will be NULL. */ if (pool->dp_free_dir != NULL) { spa_prop_add_list(*nvp, ZPOOL_PROP_FREEING, NULL, dsl_dir_phys(pool->dp_free_dir)->dd_used_bytes, src); } else { spa_prop_add_list(*nvp, ZPOOL_PROP_FREEING, NULL, 0, src); } if (pool->dp_leak_dir != NULL) { spa_prop_add_list(*nvp, ZPOOL_PROP_LEAKED, NULL, dsl_dir_phys(pool->dp_leak_dir)->dd_used_bytes, src); } else { spa_prop_add_list(*nvp, ZPOOL_PROP_LEAKED, NULL, 0, src); } } spa_prop_add_list(*nvp, ZPOOL_PROP_GUID, NULL, spa_guid(spa), src); if (spa->spa_comment != NULL) { spa_prop_add_list(*nvp, ZPOOL_PROP_COMMENT, spa->spa_comment, 0, ZPROP_SRC_LOCAL); } if (spa->spa_compatibility != NULL) { spa_prop_add_list(*nvp, ZPOOL_PROP_COMPATIBILITY, spa->spa_compatibility, 0, ZPROP_SRC_LOCAL); } if (spa->spa_root != NULL) spa_prop_add_list(*nvp, ZPOOL_PROP_ALTROOT, spa->spa_root, 0, ZPROP_SRC_LOCAL); if (spa_feature_is_enabled(spa, SPA_FEATURE_LARGE_BLOCKS)) { spa_prop_add_list(*nvp, ZPOOL_PROP_MAXBLOCKSIZE, NULL, MIN(zfs_max_recordsize, SPA_MAXBLOCKSIZE), ZPROP_SRC_NONE); } else { spa_prop_add_list(*nvp, ZPOOL_PROP_MAXBLOCKSIZE, NULL, SPA_OLD_MAXBLOCKSIZE, ZPROP_SRC_NONE); } if (spa_feature_is_enabled(spa, SPA_FEATURE_LARGE_DNODE)) { spa_prop_add_list(*nvp, ZPOOL_PROP_MAXDNODESIZE, NULL, DNODE_MAX_SIZE, ZPROP_SRC_NONE); } else { spa_prop_add_list(*nvp, ZPOOL_PROP_MAXDNODESIZE, NULL, DNODE_MIN_SIZE, ZPROP_SRC_NONE); } if ((dp = list_head(&spa->spa_config_list)) != NULL) { if (dp->scd_path == NULL) { spa_prop_add_list(*nvp, ZPOOL_PROP_CACHEFILE, "none", 0, ZPROP_SRC_LOCAL); } else if (strcmp(dp->scd_path, spa_config_path) != 0) { spa_prop_add_list(*nvp, ZPOOL_PROP_CACHEFILE, dp->scd_path, 0, ZPROP_SRC_LOCAL); } } } /* * Get zpool property values. */ int spa_prop_get(spa_t *spa, nvlist_t **nvp) { objset_t *mos = spa->spa_meta_objset; zap_cursor_t zc; zap_attribute_t za; dsl_pool_t *dp; int err; err = nvlist_alloc(nvp, NV_UNIQUE_NAME, KM_SLEEP); if (err) return (err); dp = spa_get_dsl(spa); dsl_pool_config_enter(dp, FTAG); mutex_enter(&spa->spa_props_lock); /* * Get properties from the spa config. */ spa_prop_get_config(spa, nvp); /* If no pool property object, no more prop to get. */ if (mos == NULL || spa->spa_pool_props_object == 0) goto out; /* * Get properties from the MOS pool property object. */ for (zap_cursor_init(&zc, mos, spa->spa_pool_props_object); (err = zap_cursor_retrieve(&zc, &za)) == 0; zap_cursor_advance(&zc)) { uint64_t intval = 0; char *strval = NULL; zprop_source_t src = ZPROP_SRC_DEFAULT; zpool_prop_t prop; if ((prop = zpool_name_to_prop(za.za_name)) == ZPOOL_PROP_INVAL) continue; switch (za.za_integer_length) { case 8: /* integer property */ if (za.za_first_integer != zpool_prop_default_numeric(prop)) src = ZPROP_SRC_LOCAL; if (prop == ZPOOL_PROP_BOOTFS) { dsl_dataset_t *ds = NULL; err = dsl_dataset_hold_obj(dp, za.za_first_integer, FTAG, &ds); if (err != 0) break; strval = kmem_alloc(ZFS_MAX_DATASET_NAME_LEN, KM_SLEEP); dsl_dataset_name(ds, strval); dsl_dataset_rele(ds, FTAG); } else { strval = NULL; intval = za.za_first_integer; } spa_prop_add_list(*nvp, prop, strval, intval, src); if (strval != NULL) kmem_free(strval, ZFS_MAX_DATASET_NAME_LEN); break; case 1: /* string property */ strval = kmem_alloc(za.za_num_integers, KM_SLEEP); err = zap_lookup(mos, spa->spa_pool_props_object, za.za_name, 1, za.za_num_integers, strval); if (err) { kmem_free(strval, za.za_num_integers); break; } spa_prop_add_list(*nvp, prop, strval, 0, src); kmem_free(strval, za.za_num_integers); break; default: break; } } zap_cursor_fini(&zc); out: mutex_exit(&spa->spa_props_lock); dsl_pool_config_exit(dp, FTAG); if (err && err != ENOENT) { nvlist_free(*nvp); *nvp = NULL; return (err); } return (0); } /* * Validate the given pool properties nvlist and modify the list * for the property values to be set. */ static int spa_prop_validate(spa_t *spa, nvlist_t *props) { nvpair_t *elem; int error = 0, reset_bootfs = 0; uint64_t objnum = 0; boolean_t has_feature = B_FALSE; elem = NULL; while ((elem = nvlist_next_nvpair(props, elem)) != NULL) { uint64_t intval; char *strval, *slash, *check, *fname; const char *propname = nvpair_name(elem); zpool_prop_t prop = zpool_name_to_prop(propname); switch (prop) { case ZPOOL_PROP_INVAL: if (!zpool_prop_feature(propname)) { error = SET_ERROR(EINVAL); break; } /* * Sanitize the input. */ if (nvpair_type(elem) != DATA_TYPE_UINT64) { error = SET_ERROR(EINVAL); break; } if (nvpair_value_uint64(elem, &intval) != 0) { error = SET_ERROR(EINVAL); break; } if (intval != 0) { error = SET_ERROR(EINVAL); break; } fname = strchr(propname, '@') + 1; if (zfeature_lookup_name(fname, NULL) != 0) { error = SET_ERROR(EINVAL); break; } has_feature = B_TRUE; break; case ZPOOL_PROP_VERSION: error = nvpair_value_uint64(elem, &intval); if (!error && (intval < spa_version(spa) || intval > SPA_VERSION_BEFORE_FEATURES || has_feature)) error = SET_ERROR(EINVAL); break; case ZPOOL_PROP_DELEGATION: case ZPOOL_PROP_AUTOREPLACE: case ZPOOL_PROP_LISTSNAPS: case ZPOOL_PROP_AUTOEXPAND: case ZPOOL_PROP_AUTOTRIM: error = nvpair_value_uint64(elem, &intval); if (!error && intval > 1) error = SET_ERROR(EINVAL); break; case ZPOOL_PROP_MULTIHOST: error = nvpair_value_uint64(elem, &intval); if (!error && intval > 1) error = SET_ERROR(EINVAL); if (!error) { uint32_t hostid = zone_get_hostid(NULL); if (hostid) spa->spa_hostid = hostid; else error = SET_ERROR(ENOTSUP); } break; case ZPOOL_PROP_BOOTFS: /* * If the pool version is less than SPA_VERSION_BOOTFS, * or the pool is still being created (version == 0), * the bootfs property cannot be set. */ if (spa_version(spa) < SPA_VERSION_BOOTFS) { error = SET_ERROR(ENOTSUP); break; } /* * Make sure the vdev config is bootable */ if (!vdev_is_bootable(spa->spa_root_vdev)) { error = SET_ERROR(ENOTSUP); break; } reset_bootfs = 1; error = nvpair_value_string(elem, &strval); if (!error) { objset_t *os; if (strval == NULL || strval[0] == '\0') { objnum = zpool_prop_default_numeric( ZPOOL_PROP_BOOTFS); break; } error = dmu_objset_hold(strval, FTAG, &os); if (error != 0) break; /* Must be ZPL. */ if (dmu_objset_type(os) != DMU_OST_ZFS) { error = SET_ERROR(ENOTSUP); } else { objnum = dmu_objset_id(os); } dmu_objset_rele(os, FTAG); } break; case ZPOOL_PROP_FAILUREMODE: error = nvpair_value_uint64(elem, &intval); if (!error && intval > ZIO_FAILURE_MODE_PANIC) error = SET_ERROR(EINVAL); /* * This is a special case which only occurs when * the pool has completely failed. This allows * the user to change the in-core failmode property * without syncing it out to disk (I/Os might * currently be blocked). We do this by returning * EIO to the caller (spa_prop_set) to trick it * into thinking we encountered a property validation * error. */ if (!error && spa_suspended(spa)) { spa->spa_failmode = intval; error = SET_ERROR(EIO); } break; case ZPOOL_PROP_CACHEFILE: if ((error = nvpair_value_string(elem, &strval)) != 0) break; if (strval[0] == '\0') break; if (strcmp(strval, "none") == 0) break; if (strval[0] != '/') { error = SET_ERROR(EINVAL); break; } slash = strrchr(strval, '/'); ASSERT(slash != NULL); if (slash[1] == '\0' || strcmp(slash, "/.") == 0 || strcmp(slash, "/..") == 0) error = SET_ERROR(EINVAL); break; case ZPOOL_PROP_COMMENT: if ((error = nvpair_value_string(elem, &strval)) != 0) break; for (check = strval; *check != '\0'; check++) { if (!isprint(*check)) { error = SET_ERROR(EINVAL); break; } } if (strlen(strval) > ZPROP_MAX_COMMENT) error = SET_ERROR(E2BIG); break; default: break; } if (error) break; } (void) nvlist_remove_all(props, zpool_prop_to_name(ZPOOL_PROP_DEDUPDITTO)); if (!error && reset_bootfs) { error = nvlist_remove(props, zpool_prop_to_name(ZPOOL_PROP_BOOTFS), DATA_TYPE_STRING); if (!error) { error = nvlist_add_uint64(props, zpool_prop_to_name(ZPOOL_PROP_BOOTFS), objnum); } } return (error); } void spa_configfile_set(spa_t *spa, nvlist_t *nvp, boolean_t need_sync) { char *cachefile; spa_config_dirent_t *dp; if (nvlist_lookup_string(nvp, zpool_prop_to_name(ZPOOL_PROP_CACHEFILE), &cachefile) != 0) return; dp = kmem_alloc(sizeof (spa_config_dirent_t), KM_SLEEP); if (cachefile[0] == '\0') dp->scd_path = spa_strdup(spa_config_path); else if (strcmp(cachefile, "none") == 0) dp->scd_path = NULL; else dp->scd_path = spa_strdup(cachefile); list_insert_head(&spa->spa_config_list, dp); if (need_sync) spa_async_request(spa, SPA_ASYNC_CONFIG_UPDATE); } int spa_prop_set(spa_t *spa, nvlist_t *nvp) { int error; nvpair_t *elem = NULL; boolean_t need_sync = B_FALSE; if ((error = spa_prop_validate(spa, nvp)) != 0) return (error); while ((elem = nvlist_next_nvpair(nvp, elem)) != NULL) { zpool_prop_t prop = zpool_name_to_prop(nvpair_name(elem)); if (prop == ZPOOL_PROP_CACHEFILE || prop == ZPOOL_PROP_ALTROOT || prop == ZPOOL_PROP_READONLY) continue; if (prop == ZPOOL_PROP_VERSION || prop == ZPOOL_PROP_INVAL) { uint64_t ver = 0; if (prop == ZPOOL_PROP_VERSION) { VERIFY(nvpair_value_uint64(elem, &ver) == 0); } else { ASSERT(zpool_prop_feature(nvpair_name(elem))); ver = SPA_VERSION_FEATURES; need_sync = B_TRUE; } /* Save time if the version is already set. */ if (ver == spa_version(spa)) continue; /* * In addition to the pool directory object, we might * create the pool properties object, the features for * read object, the features for write object, or the * feature descriptions object. */ error = dsl_sync_task(spa->spa_name, NULL, spa_sync_version, &ver, 6, ZFS_SPACE_CHECK_RESERVED); if (error) return (error); continue; } need_sync = B_TRUE; break; } if (need_sync) { return (dsl_sync_task(spa->spa_name, NULL, spa_sync_props, nvp, 6, ZFS_SPACE_CHECK_RESERVED)); } return (0); } /* * If the bootfs property value is dsobj, clear it. */ void spa_prop_clear_bootfs(spa_t *spa, uint64_t dsobj, dmu_tx_t *tx) { if (spa->spa_bootfs == dsobj && spa->spa_pool_props_object != 0) { VERIFY(zap_remove(spa->spa_meta_objset, spa->spa_pool_props_object, zpool_prop_to_name(ZPOOL_PROP_BOOTFS), tx) == 0); spa->spa_bootfs = 0; } } static int spa_change_guid_check(void *arg, dmu_tx_t *tx) { uint64_t *newguid __maybe_unused = arg; spa_t *spa = dmu_tx_pool(tx)->dp_spa; vdev_t *rvd = spa->spa_root_vdev; uint64_t vdev_state; if (spa_feature_is_active(spa, SPA_FEATURE_POOL_CHECKPOINT)) { int error = (spa_has_checkpoint(spa)) ? ZFS_ERR_CHECKPOINT_EXISTS : ZFS_ERR_DISCARDING_CHECKPOINT; return (SET_ERROR(error)); } spa_config_enter(spa, SCL_STATE, FTAG, RW_READER); vdev_state = rvd->vdev_state; spa_config_exit(spa, SCL_STATE, FTAG); if (vdev_state != VDEV_STATE_HEALTHY) return (SET_ERROR(ENXIO)); ASSERT3U(spa_guid(spa), !=, *newguid); return (0); } static void spa_change_guid_sync(void *arg, dmu_tx_t *tx) { uint64_t *newguid = arg; spa_t *spa = dmu_tx_pool(tx)->dp_spa; uint64_t oldguid; vdev_t *rvd = spa->spa_root_vdev; oldguid = spa_guid(spa); spa_config_enter(spa, SCL_STATE, FTAG, RW_READER); rvd->vdev_guid = *newguid; rvd->vdev_guid_sum += (*newguid - oldguid); vdev_config_dirty(rvd); spa_config_exit(spa, SCL_STATE, FTAG); spa_history_log_internal(spa, "guid change", tx, "old=%llu new=%llu", (u_longlong_t)oldguid, (u_longlong_t)*newguid); } /* * Change the GUID for the pool. This is done so that we can later * re-import a pool built from a clone of our own vdevs. We will modify * the root vdev's guid, our own pool guid, and then mark all of our * vdevs dirty. Note that we must make sure that all our vdevs are * online when we do this, or else any vdevs that weren't present * would be orphaned from our pool. We are also going to issue a * sysevent to update any watchers. */ int spa_change_guid(spa_t *spa) { int error; uint64_t guid; mutex_enter(&spa->spa_vdev_top_lock); mutex_enter(&spa_namespace_lock); guid = spa_generate_guid(NULL); error = dsl_sync_task(spa->spa_name, spa_change_guid_check, spa_change_guid_sync, &guid, 5, ZFS_SPACE_CHECK_RESERVED); if (error == 0) { /* * Clear the kobj flag from all the vdevs to allow * vdev_cache_process_kobj_evt() to post events to all the * vdevs since GUID is updated. */ vdev_clear_kobj_evt(spa->spa_root_vdev); for (int i = 0; i < spa->spa_l2cache.sav_count; i++) vdev_clear_kobj_evt(spa->spa_l2cache.sav_vdevs[i]); spa_write_cachefile(spa, B_FALSE, B_TRUE, B_TRUE); spa_event_notify(spa, NULL, NULL, ESC_ZFS_POOL_REGUID); } mutex_exit(&spa_namespace_lock); mutex_exit(&spa->spa_vdev_top_lock); return (error); } /* * ========================================================================== * SPA state manipulation (open/create/destroy/import/export) * ========================================================================== */ static int spa_error_entry_compare(const void *a, const void *b) { const spa_error_entry_t *sa = (const spa_error_entry_t *)a; const spa_error_entry_t *sb = (const spa_error_entry_t *)b; int ret; ret = memcmp(&sa->se_bookmark, &sb->se_bookmark, sizeof (zbookmark_phys_t)); return (TREE_ISIGN(ret)); } /* * Utility function which retrieves copies of the current logs and * re-initializes them in the process. */ void spa_get_errlists(spa_t *spa, avl_tree_t *last, avl_tree_t *scrub) { ASSERT(MUTEX_HELD(&spa->spa_errlist_lock)); memcpy(last, &spa->spa_errlist_last, sizeof (avl_tree_t)); memcpy(scrub, &spa->spa_errlist_scrub, sizeof (avl_tree_t)); avl_create(&spa->spa_errlist_scrub, spa_error_entry_compare, sizeof (spa_error_entry_t), offsetof(spa_error_entry_t, se_avl)); avl_create(&spa->spa_errlist_last, spa_error_entry_compare, sizeof (spa_error_entry_t), offsetof(spa_error_entry_t, se_avl)); } static void spa_taskqs_init(spa_t *spa, zio_type_t t, zio_taskq_type_t q) { const zio_taskq_info_t *ztip = &zio_taskqs[t][q]; enum zti_modes mode = ztip->zti_mode; uint_t value = ztip->zti_value; uint_t count = ztip->zti_count; spa_taskqs_t *tqs = &spa->spa_zio_taskq[t][q]; uint_t cpus, flags = TASKQ_DYNAMIC; boolean_t batch = B_FALSE; switch (mode) { case ZTI_MODE_FIXED: ASSERT3U(value, >, 0); break; case ZTI_MODE_BATCH: batch = B_TRUE; flags |= TASKQ_THREADS_CPU_PCT; value = MIN(zio_taskq_batch_pct, 100); break; case ZTI_MODE_SCALE: flags |= TASKQ_THREADS_CPU_PCT; /* * We want more taskqs to reduce lock contention, but we want * less for better request ordering and CPU utilization. */ cpus = MAX(1, boot_ncpus * zio_taskq_batch_pct / 100); if (zio_taskq_batch_tpq > 0) { count = MAX(1, (cpus + zio_taskq_batch_tpq / 2) / zio_taskq_batch_tpq); } else { /* * Prefer 6 threads per taskq, but no more taskqs * than threads in them on large systems. For 80%: * * taskq taskq total * cpus taskqs percent threads threads * ------- ------- ------- ------- ------- * 1 1 80% 1 1 * 2 1 80% 1 1 * 4 1 80% 3 3 * 8 2 40% 3 6 * 16 3 27% 4 12 * 32 5 16% 5 25 * 64 7 11% 7 49 * 128 10 8% 10 100 * 256 14 6% 15 210 */ count = 1 + cpus / 6; while (count * count > cpus) count--; } /* Limit each taskq within 100% to not trigger assertion. */ count = MAX(count, (zio_taskq_batch_pct + 99) / 100); value = (zio_taskq_batch_pct + count / 2) / count; break; case ZTI_MODE_NULL: tqs->stqs_count = 0; tqs->stqs_taskq = NULL; return; default: panic("unrecognized mode for %s_%s taskq (%u:%u) in " "spa_activate()", zio_type_name[t], zio_taskq_types[q], mode, value); break; } ASSERT3U(count, >, 0); tqs->stqs_count = count; tqs->stqs_taskq = kmem_alloc(count * sizeof (taskq_t *), KM_SLEEP); for (uint_t i = 0; i < count; i++) { taskq_t *tq; char name[32]; if (count > 1) (void) snprintf(name, sizeof (name), "%s_%s_%u", zio_type_name[t], zio_taskq_types[q], i); else (void) snprintf(name, sizeof (name), "%s_%s", zio_type_name[t], zio_taskq_types[q]); if (zio_taskq_sysdc && spa->spa_proc != &p0) { if (batch) flags |= TASKQ_DC_BATCH; (void) zio_taskq_basedc; tq = taskq_create_sysdc(name, value, 50, INT_MAX, spa->spa_proc, zio_taskq_basedc, flags); } else { pri_t pri = maxclsyspri; /* * The write issue taskq can be extremely CPU * intensive. Run it at slightly less important * priority than the other taskqs. * * Under Linux and FreeBSD this means incrementing * the priority value as opposed to platforms like * illumos where it should be decremented. * * On FreeBSD, if priorities divided by four (RQ_PPQ) * are equal then a difference between them is * insignificant. */ if (t == ZIO_TYPE_WRITE && q == ZIO_TASKQ_ISSUE) { #if defined(__linux__) pri++; #elif defined(__FreeBSD__) pri += 4; #else #error "unknown OS" #endif } tq = taskq_create_proc(name, value, pri, 50, INT_MAX, spa->spa_proc, flags); } tqs->stqs_taskq[i] = tq; } } static void spa_taskqs_fini(spa_t *spa, zio_type_t t, zio_taskq_type_t q) { spa_taskqs_t *tqs = &spa->spa_zio_taskq[t][q]; if (tqs->stqs_taskq == NULL) { ASSERT3U(tqs->stqs_count, ==, 0); return; } for (uint_t i = 0; i < tqs->stqs_count; i++) { ASSERT3P(tqs->stqs_taskq[i], !=, NULL); taskq_destroy(tqs->stqs_taskq[i]); } kmem_free(tqs->stqs_taskq, tqs->stqs_count * sizeof (taskq_t *)); tqs->stqs_taskq = NULL; } /* * Dispatch a task to the appropriate taskq for the ZFS I/O type and priority. * Note that a type may have multiple discrete taskqs to avoid lock contention * on the taskq itself. In that case we choose which taskq at random by using * the low bits of gethrtime(). */ void spa_taskq_dispatch_ent(spa_t *spa, zio_type_t t, zio_taskq_type_t q, task_func_t *func, void *arg, uint_t flags, taskq_ent_t *ent) { spa_taskqs_t *tqs = &spa->spa_zio_taskq[t][q]; taskq_t *tq; ASSERT3P(tqs->stqs_taskq, !=, NULL); ASSERT3U(tqs->stqs_count, !=, 0); if (tqs->stqs_count == 1) { tq = tqs->stqs_taskq[0]; } else { tq = tqs->stqs_taskq[((uint64_t)gethrtime()) % tqs->stqs_count]; } taskq_dispatch_ent(tq, func, arg, flags, ent); } /* * Same as spa_taskq_dispatch_ent() but block on the task until completion. */ void spa_taskq_dispatch_sync(spa_t *spa, zio_type_t t, zio_taskq_type_t q, task_func_t *func, void *arg, uint_t flags) { spa_taskqs_t *tqs = &spa->spa_zio_taskq[t][q]; taskq_t *tq; taskqid_t id; ASSERT3P(tqs->stqs_taskq, !=, NULL); ASSERT3U(tqs->stqs_count, !=, 0); if (tqs->stqs_count == 1) { tq = tqs->stqs_taskq[0]; } else { tq = tqs->stqs_taskq[((uint64_t)gethrtime()) % tqs->stqs_count]; } id = taskq_dispatch(tq, func, arg, flags); if (id) taskq_wait_id(tq, id); } static void spa_create_zio_taskqs(spa_t *spa) { for (int t = 0; t < ZIO_TYPES; t++) { for (int q = 0; q < ZIO_TASKQ_TYPES; q++) { spa_taskqs_init(spa, t, q); } } } /* * Disabled until spa_thread() can be adapted for Linux. */ #undef HAVE_SPA_THREAD #if defined(_KERNEL) && defined(HAVE_SPA_THREAD) static void spa_thread(void *arg) { psetid_t zio_taskq_psrset_bind = PS_NONE; callb_cpr_t cprinfo; spa_t *spa = arg; user_t *pu = PTOU(curproc); CALLB_CPR_INIT(&cprinfo, &spa->spa_proc_lock, callb_generic_cpr, spa->spa_name); ASSERT(curproc != &p0); (void) snprintf(pu->u_psargs, sizeof (pu->u_psargs), "zpool-%s", spa->spa_name); (void) strlcpy(pu->u_comm, pu->u_psargs, sizeof (pu->u_comm)); /* bind this thread to the requested psrset */ if (zio_taskq_psrset_bind != PS_NONE) { pool_lock(); mutex_enter(&cpu_lock); mutex_enter(&pidlock); mutex_enter(&curproc->p_lock); if (cpupart_bind_thread(curthread, zio_taskq_psrset_bind, 0, NULL, NULL) == 0) { curthread->t_bind_pset = zio_taskq_psrset_bind; } else { cmn_err(CE_WARN, "Couldn't bind process for zfs pool \"%s\" to " "pset %d\n", spa->spa_name, zio_taskq_psrset_bind); } mutex_exit(&curproc->p_lock); mutex_exit(&pidlock); mutex_exit(&cpu_lock); pool_unlock(); } if (zio_taskq_sysdc) { sysdc_thread_enter(curthread, 100, 0); } spa->spa_proc = curproc; spa->spa_did = curthread->t_did; spa_create_zio_taskqs(spa); mutex_enter(&spa->spa_proc_lock); ASSERT(spa->spa_proc_state == SPA_PROC_CREATED); spa->spa_proc_state = SPA_PROC_ACTIVE; cv_broadcast(&spa->spa_proc_cv); CALLB_CPR_SAFE_BEGIN(&cprinfo); while (spa->spa_proc_state == SPA_PROC_ACTIVE) cv_wait(&spa->spa_proc_cv, &spa->spa_proc_lock); CALLB_CPR_SAFE_END(&cprinfo, &spa->spa_proc_lock); ASSERT(spa->spa_proc_state == SPA_PROC_DEACTIVATE); spa->spa_proc_state = SPA_PROC_GONE; spa->spa_proc = &p0; cv_broadcast(&spa->spa_proc_cv); CALLB_CPR_EXIT(&cprinfo); /* drops spa_proc_lock */ mutex_enter(&curproc->p_lock); lwp_exit(); } #endif /* * Activate an uninitialized pool. */ static void spa_activate(spa_t *spa, spa_mode_t mode) { ASSERT(spa->spa_state == POOL_STATE_UNINITIALIZED); spa->spa_state = POOL_STATE_ACTIVE; spa->spa_mode = mode; spa->spa_read_spacemaps = spa_mode_readable_spacemaps; spa->spa_normal_class = metaslab_class_create(spa, &zfs_metaslab_ops); spa->spa_log_class = metaslab_class_create(spa, &zfs_metaslab_ops); spa->spa_embedded_log_class = metaslab_class_create(spa, &zfs_metaslab_ops); spa->spa_special_class = metaslab_class_create(spa, &zfs_metaslab_ops); spa->spa_dedup_class = metaslab_class_create(spa, &zfs_metaslab_ops); /* Try to create a covering process */ mutex_enter(&spa->spa_proc_lock); ASSERT(spa->spa_proc_state == SPA_PROC_NONE); ASSERT(spa->spa_proc == &p0); spa->spa_did = 0; (void) spa_create_process; #ifdef HAVE_SPA_THREAD /* Only create a process if we're going to be around a while. */ if (spa_create_process && strcmp(spa->spa_name, TRYIMPORT_NAME) != 0) { if (newproc(spa_thread, (caddr_t)spa, syscid, maxclsyspri, NULL, 0) == 0) { spa->spa_proc_state = SPA_PROC_CREATED; while (spa->spa_proc_state == SPA_PROC_CREATED) { cv_wait(&spa->spa_proc_cv, &spa->spa_proc_lock); } ASSERT(spa->spa_proc_state == SPA_PROC_ACTIVE); ASSERT(spa->spa_proc != &p0); ASSERT(spa->spa_did != 0); } else { #ifdef _KERNEL cmn_err(CE_WARN, "Couldn't create process for zfs pool \"%s\"\n", spa->spa_name); #endif } } #endif /* HAVE_SPA_THREAD */ mutex_exit(&spa->spa_proc_lock); /* If we didn't create a process, we need to create our taskqs. */ if (spa->spa_proc == &p0) { spa_create_zio_taskqs(spa); } for (size_t i = 0; i < TXG_SIZE; i++) { spa->spa_txg_zio[i] = zio_root(spa, NULL, NULL, ZIO_FLAG_CANFAIL); } list_create(&spa->spa_config_dirty_list, sizeof (vdev_t), offsetof(vdev_t, vdev_config_dirty_node)); list_create(&spa->spa_evicting_os_list, sizeof (objset_t), offsetof(objset_t, os_evicting_node)); list_create(&spa->spa_state_dirty_list, sizeof (vdev_t), offsetof(vdev_t, vdev_state_dirty_node)); txg_list_create(&spa->spa_vdev_txg_list, spa, offsetof(struct vdev, vdev_txg_node)); avl_create(&spa->spa_errlist_scrub, spa_error_entry_compare, sizeof (spa_error_entry_t), offsetof(spa_error_entry_t, se_avl)); avl_create(&spa->spa_errlist_last, spa_error_entry_compare, sizeof (spa_error_entry_t), offsetof(spa_error_entry_t, se_avl)); avl_create(&spa->spa_errlist_healed, spa_error_entry_compare, sizeof (spa_error_entry_t), offsetof(spa_error_entry_t, se_avl)); spa_activate_os(spa); spa_keystore_init(&spa->spa_keystore); /* * This taskq is used to perform zvol-minor-related tasks * asynchronously. This has several advantages, including easy * resolution of various deadlocks. * * The taskq must be single threaded to ensure tasks are always * processed in the order in which they were dispatched. * * A taskq per pool allows one to keep the pools independent. * This way if one pool is suspended, it will not impact another. * * The preferred location to dispatch a zvol minor task is a sync * task. In this context, there is easy access to the spa_t and minimal * error handling is required because the sync task must succeed. */ spa->spa_zvol_taskq = taskq_create("z_zvol", 1, defclsyspri, 1, INT_MAX, 0); /* * Taskq dedicated to prefetcher threads: this is used to prevent the * pool traverse code from monopolizing the global (and limited) * system_taskq by inappropriately scheduling long running tasks on it. */ spa->spa_prefetch_taskq = taskq_create("z_prefetch", 100, defclsyspri, 1, INT_MAX, TASKQ_DYNAMIC | TASKQ_THREADS_CPU_PCT); /* * The taskq to upgrade datasets in this pool. Currently used by * feature SPA_FEATURE_USEROBJ_ACCOUNTING/SPA_FEATURE_PROJECT_QUOTA. */ spa->spa_upgrade_taskq = taskq_create("z_upgrade", 100, defclsyspri, 1, INT_MAX, TASKQ_DYNAMIC | TASKQ_THREADS_CPU_PCT); } /* * Opposite of spa_activate(). */ static void spa_deactivate(spa_t *spa) { ASSERT(spa->spa_sync_on == B_FALSE); ASSERT(spa->spa_dsl_pool == NULL); ASSERT(spa->spa_root_vdev == NULL); ASSERT(spa->spa_async_zio_root == NULL); ASSERT(spa->spa_state != POOL_STATE_UNINITIALIZED); spa_evicting_os_wait(spa); if (spa->spa_zvol_taskq) { taskq_destroy(spa->spa_zvol_taskq); spa->spa_zvol_taskq = NULL; } if (spa->spa_prefetch_taskq) { taskq_destroy(spa->spa_prefetch_taskq); spa->spa_prefetch_taskq = NULL; } if (spa->spa_upgrade_taskq) { taskq_destroy(spa->spa_upgrade_taskq); spa->spa_upgrade_taskq = NULL; } txg_list_destroy(&spa->spa_vdev_txg_list); list_destroy(&spa->spa_config_dirty_list); list_destroy(&spa->spa_evicting_os_list); list_destroy(&spa->spa_state_dirty_list); taskq_cancel_id(system_delay_taskq, spa->spa_deadman_tqid); for (int t = 0; t < ZIO_TYPES; t++) { for (int q = 0; q < ZIO_TASKQ_TYPES; q++) { spa_taskqs_fini(spa, t, q); } } for (size_t i = 0; i < TXG_SIZE; i++) { ASSERT3P(spa->spa_txg_zio[i], !=, NULL); VERIFY0(zio_wait(spa->spa_txg_zio[i])); spa->spa_txg_zio[i] = NULL; } metaslab_class_destroy(spa->spa_normal_class); spa->spa_normal_class = NULL; metaslab_class_destroy(spa->spa_log_class); spa->spa_log_class = NULL; metaslab_class_destroy(spa->spa_embedded_log_class); spa->spa_embedded_log_class = NULL; metaslab_class_destroy(spa->spa_special_class); spa->spa_special_class = NULL; metaslab_class_destroy(spa->spa_dedup_class); spa->spa_dedup_class = NULL; /* * If this was part of an import or the open otherwise failed, we may * still have errors left in the queues. Empty them just in case. */ spa_errlog_drain(spa); avl_destroy(&spa->spa_errlist_scrub); avl_destroy(&spa->spa_errlist_last); avl_destroy(&spa->spa_errlist_healed); spa_keystore_fini(&spa->spa_keystore); spa->spa_state = POOL_STATE_UNINITIALIZED; mutex_enter(&spa->spa_proc_lock); if (spa->spa_proc_state != SPA_PROC_NONE) { ASSERT(spa->spa_proc_state == SPA_PROC_ACTIVE); spa->spa_proc_state = SPA_PROC_DEACTIVATE; cv_broadcast(&spa->spa_proc_cv); while (spa->spa_proc_state == SPA_PROC_DEACTIVATE) { ASSERT(spa->spa_proc != &p0); cv_wait(&spa->spa_proc_cv, &spa->spa_proc_lock); } ASSERT(spa->spa_proc_state == SPA_PROC_GONE); spa->spa_proc_state = SPA_PROC_NONE; } ASSERT(spa->spa_proc == &p0); mutex_exit(&spa->spa_proc_lock); /* * We want to make sure spa_thread() has actually exited the ZFS * module, so that the module can't be unloaded out from underneath * it. */ if (spa->spa_did != 0) { thread_join(spa->spa_did); spa->spa_did = 0; } spa_deactivate_os(spa); } /* * Verify a pool configuration, and construct the vdev tree appropriately. This * will create all the necessary vdevs in the appropriate layout, with each vdev * in the CLOSED state. This will prep the pool before open/creation/import. * All vdev validation is done by the vdev_alloc() routine. */ int spa_config_parse(spa_t *spa, vdev_t **vdp, nvlist_t *nv, vdev_t *parent, uint_t id, int atype) { nvlist_t **child; uint_t children; int error; if ((error = vdev_alloc(spa, vdp, nv, parent, id, atype)) != 0) return (error); if ((*vdp)->vdev_ops->vdev_op_leaf) return (0); error = nvlist_lookup_nvlist_array(nv, ZPOOL_CONFIG_CHILDREN, &child, &children); if (error == ENOENT) return (0); if (error) { vdev_free(*vdp); *vdp = NULL; return (SET_ERROR(EINVAL)); } for (int c = 0; c < children; c++) { vdev_t *vd; if ((error = spa_config_parse(spa, &vd, child[c], *vdp, c, atype)) != 0) { vdev_free(*vdp); *vdp = NULL; return (error); } } ASSERT(*vdp != NULL); return (0); } static boolean_t spa_should_flush_logs_on_unload(spa_t *spa) { if (!spa_feature_is_active(spa, SPA_FEATURE_LOG_SPACEMAP)) return (B_FALSE); if (!spa_writeable(spa)) return (B_FALSE); if (!spa->spa_sync_on) return (B_FALSE); if (spa_state(spa) != POOL_STATE_EXPORTED) return (B_FALSE); if (zfs_keep_log_spacemaps_at_export) return (B_FALSE); return (B_TRUE); } /* * Opens a transaction that will set the flag that will instruct * spa_sync to attempt to flush all the metaslabs for that txg. */ static void spa_unload_log_sm_flush_all(spa_t *spa) { dmu_tx_t *tx = dmu_tx_create_dd(spa_get_dsl(spa)->dp_mos_dir); VERIFY0(dmu_tx_assign(tx, TXG_WAIT)); ASSERT3U(spa->spa_log_flushall_txg, ==, 0); spa->spa_log_flushall_txg = dmu_tx_get_txg(tx); dmu_tx_commit(tx); txg_wait_synced(spa_get_dsl(spa), spa->spa_log_flushall_txg); } static void spa_unload_log_sm_metadata(spa_t *spa) { void *cookie = NULL; spa_log_sm_t *sls; while ((sls = avl_destroy_nodes(&spa->spa_sm_logs_by_txg, &cookie)) != NULL) { VERIFY0(sls->sls_mscount); kmem_free(sls, sizeof (spa_log_sm_t)); } for (log_summary_entry_t *e = list_head(&spa->spa_log_summary); e != NULL; e = list_head(&spa->spa_log_summary)) { VERIFY0(e->lse_mscount); list_remove(&spa->spa_log_summary, e); kmem_free(e, sizeof (log_summary_entry_t)); } spa->spa_unflushed_stats.sus_nblocks = 0; spa->spa_unflushed_stats.sus_memused = 0; spa->spa_unflushed_stats.sus_blocklimit = 0; } static void spa_destroy_aux_threads(spa_t *spa) { if (spa->spa_condense_zthr != NULL) { zthr_destroy(spa->spa_condense_zthr); spa->spa_condense_zthr = NULL; } if (spa->spa_checkpoint_discard_zthr != NULL) { zthr_destroy(spa->spa_checkpoint_discard_zthr); spa->spa_checkpoint_discard_zthr = NULL; } if (spa->spa_livelist_delete_zthr != NULL) { zthr_destroy(spa->spa_livelist_delete_zthr); spa->spa_livelist_delete_zthr = NULL; } if (spa->spa_livelist_condense_zthr != NULL) { zthr_destroy(spa->spa_livelist_condense_zthr); spa->spa_livelist_condense_zthr = NULL; } } /* * Opposite of spa_load(). */ static void spa_unload(spa_t *spa) { ASSERT(MUTEX_HELD(&spa_namespace_lock)); ASSERT(spa_state(spa) != POOL_STATE_UNINITIALIZED); spa_import_progress_remove(spa_guid(spa)); spa_load_note(spa, "UNLOADING"); spa_wake_waiters(spa); /* * If we have set the spa_final_txg, we have already performed the * tasks below in spa_export_common(). We should not redo it here since * we delay the final TXGs beyond what spa_final_txg is set at. */ if (spa->spa_final_txg == UINT64_MAX) { /* * If the log space map feature is enabled and the pool is * getting exported (but not destroyed), we want to spend some * time flushing as many metaslabs as we can in an attempt to * destroy log space maps and save import time. */ if (spa_should_flush_logs_on_unload(spa)) spa_unload_log_sm_flush_all(spa); /* * Stop async tasks. */ spa_async_suspend(spa); if (spa->spa_root_vdev) { vdev_t *root_vdev = spa->spa_root_vdev; vdev_initialize_stop_all(root_vdev, VDEV_INITIALIZE_ACTIVE); vdev_trim_stop_all(root_vdev, VDEV_TRIM_ACTIVE); vdev_autotrim_stop_all(spa); vdev_rebuild_stop_all(spa); } } /* * Stop syncing. */ if (spa->spa_sync_on) { txg_sync_stop(spa->spa_dsl_pool); spa->spa_sync_on = B_FALSE; } /* * This ensures that there is no async metaslab prefetching * while we attempt to unload the spa. */ if (spa->spa_root_vdev != NULL) { for (int c = 0; c < spa->spa_root_vdev->vdev_children; c++) { vdev_t *vc = spa->spa_root_vdev->vdev_child[c]; if (vc->vdev_mg != NULL) taskq_wait(vc->vdev_mg->mg_taskq); } } if (spa->spa_mmp.mmp_thread) mmp_thread_stop(spa); /* * Wait for any outstanding async I/O to complete. */ if (spa->spa_async_zio_root != NULL) { for (int i = 0; i < max_ncpus; i++) (void) zio_wait(spa->spa_async_zio_root[i]); kmem_free(spa->spa_async_zio_root, max_ncpus * sizeof (void *)); spa->spa_async_zio_root = NULL; } if (spa->spa_vdev_removal != NULL) { spa_vdev_removal_destroy(spa->spa_vdev_removal); spa->spa_vdev_removal = NULL; } spa_destroy_aux_threads(spa); spa_condense_fini(spa); bpobj_close(&spa->spa_deferred_bpobj); spa_config_enter(spa, SCL_ALL, spa, RW_WRITER); /* * Close all vdevs. */ if (spa->spa_root_vdev) vdev_free(spa->spa_root_vdev); ASSERT(spa->spa_root_vdev == NULL); /* * Close the dsl pool. */ if (spa->spa_dsl_pool) { dsl_pool_close(spa->spa_dsl_pool); spa->spa_dsl_pool = NULL; spa->spa_meta_objset = NULL; } ddt_unload(spa); spa_unload_log_sm_metadata(spa); /* * Drop and purge level 2 cache */ spa_l2cache_drop(spa); for (int i = 0; i < spa->spa_spares.sav_count; i++) vdev_free(spa->spa_spares.sav_vdevs[i]); if (spa->spa_spares.sav_vdevs) { kmem_free(spa->spa_spares.sav_vdevs, spa->spa_spares.sav_count * sizeof (void *)); spa->spa_spares.sav_vdevs = NULL; } if (spa->spa_spares.sav_config) { nvlist_free(spa->spa_spares.sav_config); spa->spa_spares.sav_config = NULL; } spa->spa_spares.sav_count = 0; for (int i = 0; i < spa->spa_l2cache.sav_count; i++) { vdev_clear_stats(spa->spa_l2cache.sav_vdevs[i]); vdev_free(spa->spa_l2cache.sav_vdevs[i]); } if (spa->spa_l2cache.sav_vdevs) { kmem_free(spa->spa_l2cache.sav_vdevs, spa->spa_l2cache.sav_count * sizeof (void *)); spa->spa_l2cache.sav_vdevs = NULL; } if (spa->spa_l2cache.sav_config) { nvlist_free(spa->spa_l2cache.sav_config); spa->spa_l2cache.sav_config = NULL; } spa->spa_l2cache.sav_count = 0; spa->spa_async_suspended = 0; spa->spa_indirect_vdevs_loaded = B_FALSE; if (spa->spa_comment != NULL) { spa_strfree(spa->spa_comment); spa->spa_comment = NULL; } if (spa->spa_compatibility != NULL) { spa_strfree(spa->spa_compatibility); spa->spa_compatibility = NULL; } spa_config_exit(spa, SCL_ALL, spa); } /* * Load (or re-load) the current list of vdevs describing the active spares for * this pool. When this is called, we have some form of basic information in * 'spa_spares.sav_config'. We parse this into vdevs, try to open them, and * then re-generate a more complete list including status information. */ void spa_load_spares(spa_t *spa) { nvlist_t **spares; uint_t nspares; int i; vdev_t *vd, *tvd; #ifndef _KERNEL /* * zdb opens both the current state of the pool and the * checkpointed state (if present), with a different spa_t. * * As spare vdevs are shared among open pools, we skip loading * them when we load the checkpointed state of the pool. */ if (!spa_writeable(spa)) return; #endif ASSERT(spa_config_held(spa, SCL_ALL, RW_WRITER) == SCL_ALL); /* * First, close and free any existing spare vdevs. */ for (i = 0; i < spa->spa_spares.sav_count; i++) { vd = spa->spa_spares.sav_vdevs[i]; /* Undo the call to spa_activate() below */ if ((tvd = spa_lookup_by_guid(spa, vd->vdev_guid, B_FALSE)) != NULL && tvd->vdev_isspare) spa_spare_remove(tvd); vdev_close(vd); vdev_free(vd); } if (spa->spa_spares.sav_vdevs) kmem_free(spa->spa_spares.sav_vdevs, spa->spa_spares.sav_count * sizeof (void *)); if (spa->spa_spares.sav_config == NULL) nspares = 0; else VERIFY0(nvlist_lookup_nvlist_array(spa->spa_spares.sav_config, ZPOOL_CONFIG_SPARES, &spares, &nspares)); spa->spa_spares.sav_count = (int)nspares; spa->spa_spares.sav_vdevs = NULL; if (nspares == 0) return; /* * Construct the array of vdevs, opening them to get status in the * process. For each spare, there is potentially two different vdev_t * structures associated with it: one in the list of spares (used only * for basic validation purposes) and one in the active vdev * configuration (if it's spared in). During this phase we open and * validate each vdev on the spare list. If the vdev also exists in the * active configuration, then we also mark this vdev as an active spare. */ spa->spa_spares.sav_vdevs = kmem_zalloc(nspares * sizeof (void *), KM_SLEEP); for (i = 0; i < spa->spa_spares.sav_count; i++) { VERIFY(spa_config_parse(spa, &vd, spares[i], NULL, 0, VDEV_ALLOC_SPARE) == 0); ASSERT(vd != NULL); spa->spa_spares.sav_vdevs[i] = vd; if ((tvd = spa_lookup_by_guid(spa, vd->vdev_guid, B_FALSE)) != NULL) { if (!tvd->vdev_isspare) spa_spare_add(tvd); /* * We only mark the spare active if we were successfully * able to load the vdev. Otherwise, importing a pool * with a bad active spare would result in strange * behavior, because multiple pool would think the spare * is actively in use. * * There is a vulnerability here to an equally bizarre * circumstance, where a dead active spare is later * brought back to life (onlined or otherwise). Given * the rarity of this scenario, and the extra complexity * it adds, we ignore the possibility. */ if (!vdev_is_dead(tvd)) spa_spare_activate(tvd); } vd->vdev_top = vd; vd->vdev_aux = &spa->spa_spares; if (vdev_open(vd) != 0) continue; if (vdev_validate_aux(vd) == 0) spa_spare_add(vd); } /* * Recompute the stashed list of spares, with status information * this time. */ fnvlist_remove(spa->spa_spares.sav_config, ZPOOL_CONFIG_SPARES); spares = kmem_alloc(spa->spa_spares.sav_count * sizeof (void *), KM_SLEEP); for (i = 0; i < spa->spa_spares.sav_count; i++) spares[i] = vdev_config_generate(spa, spa->spa_spares.sav_vdevs[i], B_TRUE, VDEV_CONFIG_SPARE); fnvlist_add_nvlist_array(spa->spa_spares.sav_config, ZPOOL_CONFIG_SPARES, (const nvlist_t * const *)spares, spa->spa_spares.sav_count); for (i = 0; i < spa->spa_spares.sav_count; i++) nvlist_free(spares[i]); kmem_free(spares, spa->spa_spares.sav_count * sizeof (void *)); } /* * Load (or re-load) the current list of vdevs describing the active l2cache for * this pool. When this is called, we have some form of basic information in * 'spa_l2cache.sav_config'. We parse this into vdevs, try to open them, and * then re-generate a more complete list including status information. * Devices which are already active have their details maintained, and are * not re-opened. */ void spa_load_l2cache(spa_t *spa) { nvlist_t **l2cache = NULL; uint_t nl2cache; int i, j, oldnvdevs; uint64_t guid; vdev_t *vd, **oldvdevs, **newvdevs; spa_aux_vdev_t *sav = &spa->spa_l2cache; #ifndef _KERNEL /* * zdb opens both the current state of the pool and the * checkpointed state (if present), with a different spa_t. * * As L2 caches are part of the ARC which is shared among open * pools, we skip loading them when we load the checkpointed * state of the pool. */ if (!spa_writeable(spa)) return; #endif ASSERT(spa_config_held(spa, SCL_ALL, RW_WRITER) == SCL_ALL); oldvdevs = sav->sav_vdevs; oldnvdevs = sav->sav_count; sav->sav_vdevs = NULL; sav->sav_count = 0; if (sav->sav_config == NULL) { nl2cache = 0; newvdevs = NULL; goto out; } VERIFY0(nvlist_lookup_nvlist_array(sav->sav_config, ZPOOL_CONFIG_L2CACHE, &l2cache, &nl2cache)); newvdevs = kmem_alloc(nl2cache * sizeof (void *), KM_SLEEP); /* * Process new nvlist of vdevs. */ for (i = 0; i < nl2cache; i++) { guid = fnvlist_lookup_uint64(l2cache[i], ZPOOL_CONFIG_GUID); newvdevs[i] = NULL; for (j = 0; j < oldnvdevs; j++) { vd = oldvdevs[j]; if (vd != NULL && guid == vd->vdev_guid) { /* * Retain previous vdev for add/remove ops. */ newvdevs[i] = vd; oldvdevs[j] = NULL; break; } } if (newvdevs[i] == NULL) { /* * Create new vdev */ VERIFY(spa_config_parse(spa, &vd, l2cache[i], NULL, 0, VDEV_ALLOC_L2CACHE) == 0); ASSERT(vd != NULL); newvdevs[i] = vd; /* * Commit this vdev as an l2cache device, * even if it fails to open. */ spa_l2cache_add(vd); vd->vdev_top = vd; vd->vdev_aux = sav; spa_l2cache_activate(vd); if (vdev_open(vd) != 0) continue; (void) vdev_validate_aux(vd); if (!vdev_is_dead(vd)) l2arc_add_vdev(spa, vd); /* * Upon cache device addition to a pool or pool * creation with a cache device or if the header * of the device is invalid we issue an async * TRIM command for the whole device which will * execute if l2arc_trim_ahead > 0. */ spa_async_request(spa, SPA_ASYNC_L2CACHE_TRIM); } } sav->sav_vdevs = newvdevs; sav->sav_count = (int)nl2cache; /* * Recompute the stashed list of l2cache devices, with status * information this time. */ fnvlist_remove(sav->sav_config, ZPOOL_CONFIG_L2CACHE); if (sav->sav_count > 0) l2cache = kmem_alloc(sav->sav_count * sizeof (void *), KM_SLEEP); for (i = 0; i < sav->sav_count; i++) l2cache[i] = vdev_config_generate(spa, sav->sav_vdevs[i], B_TRUE, VDEV_CONFIG_L2CACHE); fnvlist_add_nvlist_array(sav->sav_config, ZPOOL_CONFIG_L2CACHE, (const nvlist_t * const *)l2cache, sav->sav_count); out: /* * Purge vdevs that were dropped */ for (i = 0; i < oldnvdevs; i++) { uint64_t pool; vd = oldvdevs[i]; if (vd != NULL) { ASSERT(vd->vdev_isl2cache); if (spa_l2cache_exists(vd->vdev_guid, &pool) && pool != 0ULL && l2arc_vdev_present(vd)) l2arc_remove_vdev(vd); vdev_clear_stats(vd); vdev_free(vd); } } if (oldvdevs) kmem_free(oldvdevs, oldnvdevs * sizeof (void *)); for (i = 0; i < sav->sav_count; i++) nvlist_free(l2cache[i]); if (sav->sav_count) kmem_free(l2cache, sav->sav_count * sizeof (void *)); } static int load_nvlist(spa_t *spa, uint64_t obj, nvlist_t **value) { dmu_buf_t *db; char *packed = NULL; size_t nvsize = 0; int error; *value = NULL; error = dmu_bonus_hold(spa->spa_meta_objset, obj, FTAG, &db); if (error) return (error); nvsize = *(uint64_t *)db->db_data; dmu_buf_rele(db, FTAG); packed = vmem_alloc(nvsize, KM_SLEEP); error = dmu_read(spa->spa_meta_objset, obj, 0, nvsize, packed, DMU_READ_PREFETCH); if (error == 0) error = nvlist_unpack(packed, nvsize, value, 0); vmem_free(packed, nvsize); return (error); } /* * Concrete top-level vdevs that are not missing and are not logs. At every * spa_sync we write new uberblocks to at least SPA_SYNC_MIN_VDEVS core tvds. */ static uint64_t spa_healthy_core_tvds(spa_t *spa) { vdev_t *rvd = spa->spa_root_vdev; uint64_t tvds = 0; for (uint64_t i = 0; i < rvd->vdev_children; i++) { vdev_t *vd = rvd->vdev_child[i]; if (vd->vdev_islog) continue; if (vdev_is_concrete(vd) && !vdev_is_dead(vd)) tvds++; } return (tvds); } /* * Checks to see if the given vdev could not be opened, in which case we post a * sysevent to notify the autoreplace code that the device has been removed. */ static void spa_check_removed(vdev_t *vd) { for (uint64_t c = 0; c < vd->vdev_children; c++) spa_check_removed(vd->vdev_child[c]); if (vd->vdev_ops->vdev_op_leaf && vdev_is_dead(vd) && vdev_is_concrete(vd)) { zfs_post_autoreplace(vd->vdev_spa, vd); spa_event_notify(vd->vdev_spa, vd, NULL, ESC_ZFS_VDEV_CHECK); } } static int spa_check_for_missing_logs(spa_t *spa) { vdev_t *rvd = spa->spa_root_vdev; /* * If we're doing a normal import, then build up any additional * diagnostic information about missing log devices. * We'll pass this up to the user for further processing. */ if (!(spa->spa_import_flags & ZFS_IMPORT_MISSING_LOG)) { nvlist_t **child, *nv; uint64_t idx = 0; child = kmem_alloc(rvd->vdev_children * sizeof (nvlist_t *), KM_SLEEP); nv = fnvlist_alloc(); for (uint64_t c = 0; c < rvd->vdev_children; c++) { vdev_t *tvd = rvd->vdev_child[c]; /* * We consider a device as missing only if it failed * to open (i.e. offline or faulted is not considered * as missing). */ if (tvd->vdev_islog && tvd->vdev_state == VDEV_STATE_CANT_OPEN) { child[idx++] = vdev_config_generate(spa, tvd, B_FALSE, VDEV_CONFIG_MISSING); } } if (idx > 0) { fnvlist_add_nvlist_array(nv, ZPOOL_CONFIG_CHILDREN, (const nvlist_t * const *)child, idx); fnvlist_add_nvlist(spa->spa_load_info, ZPOOL_CONFIG_MISSING_DEVICES, nv); for (uint64_t i = 0; i < idx; i++) nvlist_free(child[i]); } nvlist_free(nv); kmem_free(child, rvd->vdev_children * sizeof (char **)); if (idx > 0) { spa_load_failed(spa, "some log devices are missing"); vdev_dbgmsg_print_tree(rvd, 2); return (SET_ERROR(ENXIO)); } } else { for (uint64_t c = 0; c < rvd->vdev_children; c++) { vdev_t *tvd = rvd->vdev_child[c]; if (tvd->vdev_islog && tvd->vdev_state == VDEV_STATE_CANT_OPEN) { spa_set_log_state(spa, SPA_LOG_CLEAR); spa_load_note(spa, "some log devices are " "missing, ZIL is dropped."); vdev_dbgmsg_print_tree(rvd, 2); break; } } } return (0); } /* * Check for missing log devices */ static boolean_t spa_check_logs(spa_t *spa) { boolean_t rv = B_FALSE; dsl_pool_t *dp = spa_get_dsl(spa); switch (spa->spa_log_state) { default: break; case SPA_LOG_MISSING: /* need to recheck in case slog has been restored */ case SPA_LOG_UNKNOWN: rv = (dmu_objset_find_dp(dp, dp->dp_root_dir_obj, zil_check_log_chain, NULL, DS_FIND_CHILDREN) != 0); if (rv) spa_set_log_state(spa, SPA_LOG_MISSING); break; } return (rv); } /* * Passivate any log vdevs (note, does not apply to embedded log metaslabs). */ static boolean_t spa_passivate_log(spa_t *spa) { vdev_t *rvd = spa->spa_root_vdev; boolean_t slog_found = B_FALSE; ASSERT(spa_config_held(spa, SCL_ALLOC, RW_WRITER)); for (int c = 0; c < rvd->vdev_children; c++) { vdev_t *tvd = rvd->vdev_child[c]; if (tvd->vdev_islog) { ASSERT3P(tvd->vdev_log_mg, ==, NULL); metaslab_group_passivate(tvd->vdev_mg); slog_found = B_TRUE; } } return (slog_found); } /* * Activate any log vdevs (note, does not apply to embedded log metaslabs). */ static void spa_activate_log(spa_t *spa) { vdev_t *rvd = spa->spa_root_vdev; ASSERT(spa_config_held(spa, SCL_ALLOC, RW_WRITER)); for (int c = 0; c < rvd->vdev_children; c++) { vdev_t *tvd = rvd->vdev_child[c]; if (tvd->vdev_islog) { ASSERT3P(tvd->vdev_log_mg, ==, NULL); metaslab_group_activate(tvd->vdev_mg); } } } int spa_reset_logs(spa_t *spa) { int error; error = dmu_objset_find(spa_name(spa), zil_reset, NULL, DS_FIND_CHILDREN); if (error == 0) { /* * We successfully offlined the log device, sync out the * current txg so that the "stubby" block can be removed * by zil_sync(). */ txg_wait_synced(spa->spa_dsl_pool, 0); } return (error); } static void spa_aux_check_removed(spa_aux_vdev_t *sav) { for (int i = 0; i < sav->sav_count; i++) spa_check_removed(sav->sav_vdevs[i]); } void spa_claim_notify(zio_t *zio) { spa_t *spa = zio->io_spa; if (zio->io_error) return; mutex_enter(&spa->spa_props_lock); /* any mutex will do */ if (spa->spa_claim_max_txg < zio->io_bp->blk_birth) spa->spa_claim_max_txg = zio->io_bp->blk_birth; mutex_exit(&spa->spa_props_lock); } typedef struct spa_load_error { boolean_t sle_verify_data; uint64_t sle_meta_count; uint64_t sle_data_count; } spa_load_error_t; static void spa_load_verify_done(zio_t *zio) { blkptr_t *bp = zio->io_bp; spa_load_error_t *sle = zio->io_private; dmu_object_type_t type = BP_GET_TYPE(bp); int error = zio->io_error; spa_t *spa = zio->io_spa; abd_free(zio->io_abd); if (error) { if ((BP_GET_LEVEL(bp) != 0 || DMU_OT_IS_METADATA(type)) && type != DMU_OT_INTENT_LOG) atomic_inc_64(&sle->sle_meta_count); else atomic_inc_64(&sle->sle_data_count); } mutex_enter(&spa->spa_scrub_lock); spa->spa_load_verify_bytes -= BP_GET_PSIZE(bp); cv_broadcast(&spa->spa_scrub_io_cv); mutex_exit(&spa->spa_scrub_lock); } /* * Maximum number of inflight bytes is the log2 fraction of the arc size. * By default, we set it to 1/16th of the arc. */ static uint_t spa_load_verify_shift = 4; static int spa_load_verify_metadata = B_TRUE; static int spa_load_verify_data = B_TRUE; static int spa_load_verify_cb(spa_t *spa, zilog_t *zilog, const blkptr_t *bp, const zbookmark_phys_t *zb, const dnode_phys_t *dnp, void *arg) { zio_t *rio = arg; spa_load_error_t *sle = rio->io_private; (void) zilog, (void) dnp; /* * Note: normally this routine will not be called if * spa_load_verify_metadata is not set. However, it may be useful * to manually set the flag after the traversal has begun. */ if (!spa_load_verify_metadata) return (0); /* * Sanity check the block pointer in order to detect obvious damage * before using the contents in subsequent checks or in zio_read(). * When damaged consider it to be a metadata error since we cannot * trust the BP_GET_TYPE and BP_GET_LEVEL values. */ if (!zfs_blkptr_verify(spa, bp, B_FALSE, BLK_VERIFY_LOG)) { atomic_inc_64(&sle->sle_meta_count); return (0); } if (zb->zb_level == ZB_DNODE_LEVEL || BP_IS_HOLE(bp) || BP_IS_EMBEDDED(bp) || BP_IS_REDACTED(bp)) return (0); if (!BP_IS_METADATA(bp) && (!spa_load_verify_data || !sle->sle_verify_data)) return (0); uint64_t maxinflight_bytes = arc_target_bytes() >> spa_load_verify_shift; size_t size = BP_GET_PSIZE(bp); mutex_enter(&spa->spa_scrub_lock); while (spa->spa_load_verify_bytes >= maxinflight_bytes) cv_wait(&spa->spa_scrub_io_cv, &spa->spa_scrub_lock); spa->spa_load_verify_bytes += size; mutex_exit(&spa->spa_scrub_lock); zio_nowait(zio_read(rio, spa, bp, abd_alloc_for_io(size, B_FALSE), size, spa_load_verify_done, rio->io_private, ZIO_PRIORITY_SCRUB, ZIO_FLAG_SPECULATIVE | ZIO_FLAG_CANFAIL | ZIO_FLAG_SCRUB | ZIO_FLAG_RAW, zb)); return (0); } static int verify_dataset_name_len(dsl_pool_t *dp, dsl_dataset_t *ds, void *arg) { (void) dp, (void) arg; if (dsl_dataset_namelen(ds) >= ZFS_MAX_DATASET_NAME_LEN) return (SET_ERROR(ENAMETOOLONG)); return (0); } static int spa_load_verify(spa_t *spa) { zio_t *rio; spa_load_error_t sle = { 0 }; zpool_load_policy_t policy; boolean_t verify_ok = B_FALSE; int error = 0; zpool_get_load_policy(spa->spa_config, &policy); if (policy.zlp_rewind & ZPOOL_NEVER_REWIND || policy.zlp_maxmeta == UINT64_MAX) return (0); dsl_pool_config_enter(spa->spa_dsl_pool, FTAG); error = dmu_objset_find_dp(spa->spa_dsl_pool, spa->spa_dsl_pool->dp_root_dir_obj, verify_dataset_name_len, NULL, DS_FIND_CHILDREN); dsl_pool_config_exit(spa->spa_dsl_pool, FTAG); if (error != 0) return (error); /* * Verify data only if we are rewinding or error limit was set. * Otherwise nothing except dbgmsg care about it to waste time. */ sle.sle_verify_data = (policy.zlp_rewind & ZPOOL_REWIND_MASK) || (policy.zlp_maxdata < UINT64_MAX); rio = zio_root(spa, NULL, &sle, ZIO_FLAG_CANFAIL | ZIO_FLAG_SPECULATIVE); if (spa_load_verify_metadata) { if (spa->spa_extreme_rewind) { spa_load_note(spa, "performing a complete scan of the " "pool since extreme rewind is on. This may take " "a very long time.\n (spa_load_verify_data=%u, " "spa_load_verify_metadata=%u)", spa_load_verify_data, spa_load_verify_metadata); } error = traverse_pool(spa, spa->spa_verify_min_txg, TRAVERSE_PRE | TRAVERSE_PREFETCH_METADATA | TRAVERSE_NO_DECRYPT, spa_load_verify_cb, rio); } (void) zio_wait(rio); ASSERT0(spa->spa_load_verify_bytes); spa->spa_load_meta_errors = sle.sle_meta_count; spa->spa_load_data_errors = sle.sle_data_count; if (sle.sle_meta_count != 0 || sle.sle_data_count != 0) { spa_load_note(spa, "spa_load_verify found %llu metadata errors " "and %llu data errors", (u_longlong_t)sle.sle_meta_count, (u_longlong_t)sle.sle_data_count); } if (spa_load_verify_dryrun || (!error && sle.sle_meta_count <= policy.zlp_maxmeta && sle.sle_data_count <= policy.zlp_maxdata)) { int64_t loss = 0; verify_ok = B_TRUE; spa->spa_load_txg = spa->spa_uberblock.ub_txg; spa->spa_load_txg_ts = spa->spa_uberblock.ub_timestamp; loss = spa->spa_last_ubsync_txg_ts - spa->spa_load_txg_ts; fnvlist_add_uint64(spa->spa_load_info, ZPOOL_CONFIG_LOAD_TIME, spa->spa_load_txg_ts); fnvlist_add_int64(spa->spa_load_info, ZPOOL_CONFIG_REWIND_TIME, loss); fnvlist_add_uint64(spa->spa_load_info, ZPOOL_CONFIG_LOAD_META_ERRORS, sle.sle_meta_count); fnvlist_add_uint64(spa->spa_load_info, ZPOOL_CONFIG_LOAD_DATA_ERRORS, sle.sle_data_count); } else { spa->spa_load_max_txg = spa->spa_uberblock.ub_txg; } if (spa_load_verify_dryrun) return (0); if (error) { if (error != ENXIO && error != EIO) error = SET_ERROR(EIO); return (error); } return (verify_ok ? 0 : EIO); } /* * Find a value in the pool props object. */ static void spa_prop_find(spa_t *spa, zpool_prop_t prop, uint64_t *val) { (void) zap_lookup(spa->spa_meta_objset, spa->spa_pool_props_object, zpool_prop_to_name(prop), sizeof (uint64_t), 1, val); } /* * Find a value in the pool directory object. */ static int spa_dir_prop(spa_t *spa, const char *name, uint64_t *val, boolean_t log_enoent) { int error = zap_lookup(spa->spa_meta_objset, DMU_POOL_DIRECTORY_OBJECT, name, sizeof (uint64_t), 1, val); if (error != 0 && (error != ENOENT || log_enoent)) { spa_load_failed(spa, "couldn't get '%s' value in MOS directory " "[error=%d]", name, error); } return (error); } static int spa_vdev_err(vdev_t *vdev, vdev_aux_t aux, int err) { vdev_set_state(vdev, B_TRUE, VDEV_STATE_CANT_OPEN, aux); return (SET_ERROR(err)); } boolean_t spa_livelist_delete_check(spa_t *spa) { return (spa->spa_livelists_to_delete != 0); } static boolean_t spa_livelist_delete_cb_check(void *arg, zthr_t *z) { (void) z; spa_t *spa = arg; return (spa_livelist_delete_check(spa)); } static int delete_blkptr_cb(void *arg, const blkptr_t *bp, dmu_tx_t *tx) { spa_t *spa = arg; zio_free(spa, tx->tx_txg, bp); dsl_dir_diduse_space(tx->tx_pool->dp_free_dir, DD_USED_HEAD, -bp_get_dsize_sync(spa, bp), -BP_GET_PSIZE(bp), -BP_GET_UCSIZE(bp), tx); return (0); } static int dsl_get_next_livelist_obj(objset_t *os, uint64_t zap_obj, uint64_t *llp) { int err; zap_cursor_t zc; zap_attribute_t za; zap_cursor_init(&zc, os, zap_obj); err = zap_cursor_retrieve(&zc, &za); zap_cursor_fini(&zc); if (err == 0) *llp = za.za_first_integer; return (err); } /* * Components of livelist deletion that must be performed in syncing * context: freeing block pointers and updating the pool-wide data * structures to indicate how much work is left to do */ typedef struct sublist_delete_arg { spa_t *spa; dsl_deadlist_t *ll; uint64_t key; bplist_t *to_free; } sublist_delete_arg_t; static void sublist_delete_sync(void *arg, dmu_tx_t *tx) { sublist_delete_arg_t *sda = arg; spa_t *spa = sda->spa; dsl_deadlist_t *ll = sda->ll; uint64_t key = sda->key; bplist_t *to_free = sda->to_free; bplist_iterate(to_free, delete_blkptr_cb, spa, tx); dsl_deadlist_remove_entry(ll, key, tx); } typedef struct livelist_delete_arg { spa_t *spa; uint64_t ll_obj; uint64_t zap_obj; } livelist_delete_arg_t; static void livelist_delete_sync(void *arg, dmu_tx_t *tx) { livelist_delete_arg_t *lda = arg; spa_t *spa = lda->spa; uint64_t ll_obj = lda->ll_obj; uint64_t zap_obj = lda->zap_obj; objset_t *mos = spa->spa_meta_objset; uint64_t count; /* free the livelist and decrement the feature count */ VERIFY0(zap_remove_int(mos, zap_obj, ll_obj, tx)); dsl_deadlist_free(mos, ll_obj, tx); spa_feature_decr(spa, SPA_FEATURE_LIVELIST, tx); VERIFY0(zap_count(mos, zap_obj, &count)); if (count == 0) { /* no more livelists to delete */ VERIFY0(zap_remove(mos, DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_DELETED_CLONES, tx)); VERIFY0(zap_destroy(mos, zap_obj, tx)); spa->spa_livelists_to_delete = 0; spa_notify_waiters(spa); } } /* * Load in the value for the livelist to be removed and open it. Then, * load its first sublist and determine which block pointers should actually * be freed. Then, call a synctask which performs the actual frees and updates * the pool-wide livelist data. */ static void spa_livelist_delete_cb(void *arg, zthr_t *z) { spa_t *spa = arg; uint64_t ll_obj = 0, count; objset_t *mos = spa->spa_meta_objset; uint64_t zap_obj = spa->spa_livelists_to_delete; /* * Determine the next livelist to delete. This function should only * be called if there is at least one deleted clone. */ VERIFY0(dsl_get_next_livelist_obj(mos, zap_obj, &ll_obj)); VERIFY0(zap_count(mos, ll_obj, &count)); if (count > 0) { dsl_deadlist_t *ll; dsl_deadlist_entry_t *dle; bplist_t to_free; ll = kmem_zalloc(sizeof (dsl_deadlist_t), KM_SLEEP); dsl_deadlist_open(ll, mos, ll_obj); dle = dsl_deadlist_first(ll); ASSERT3P(dle, !=, NULL); bplist_create(&to_free); int err = dsl_process_sub_livelist(&dle->dle_bpobj, &to_free, z, NULL); if (err == 0) { sublist_delete_arg_t sync_arg = { .spa = spa, .ll = ll, .key = dle->dle_mintxg, .to_free = &to_free }; zfs_dbgmsg("deleting sublist (id %llu) from" " livelist %llu, %lld remaining", (u_longlong_t)dle->dle_bpobj.bpo_object, (u_longlong_t)ll_obj, (longlong_t)count - 1); VERIFY0(dsl_sync_task(spa_name(spa), NULL, sublist_delete_sync, &sync_arg, 0, ZFS_SPACE_CHECK_DESTROY)); } else { VERIFY3U(err, ==, EINTR); } bplist_clear(&to_free); bplist_destroy(&to_free); dsl_deadlist_close(ll); kmem_free(ll, sizeof (dsl_deadlist_t)); } else { livelist_delete_arg_t sync_arg = { .spa = spa, .ll_obj = ll_obj, .zap_obj = zap_obj }; zfs_dbgmsg("deletion of livelist %llu completed", (u_longlong_t)ll_obj); VERIFY0(dsl_sync_task(spa_name(spa), NULL, livelist_delete_sync, &sync_arg, 0, ZFS_SPACE_CHECK_DESTROY)); } } static void spa_start_livelist_destroy_thread(spa_t *spa) { ASSERT3P(spa->spa_livelist_delete_zthr, ==, NULL); spa->spa_livelist_delete_zthr = zthr_create("z_livelist_destroy", spa_livelist_delete_cb_check, spa_livelist_delete_cb, spa, minclsyspri); } typedef struct livelist_new_arg { bplist_t *allocs; bplist_t *frees; } livelist_new_arg_t; static int livelist_track_new_cb(void *arg, const blkptr_t *bp, boolean_t bp_freed, dmu_tx_t *tx) { ASSERT(tx == NULL); livelist_new_arg_t *lna = arg; if (bp_freed) { bplist_append(lna->frees, bp); } else { bplist_append(lna->allocs, bp); zfs_livelist_condense_new_alloc++; } return (0); } typedef struct livelist_condense_arg { spa_t *spa; bplist_t to_keep; uint64_t first_size; uint64_t next_size; } livelist_condense_arg_t; static void spa_livelist_condense_sync(void *arg, dmu_tx_t *tx) { livelist_condense_arg_t *lca = arg; spa_t *spa = lca->spa; bplist_t new_frees; dsl_dataset_t *ds = spa->spa_to_condense.ds; /* Have we been cancelled? */ if (spa->spa_to_condense.cancelled) { zfs_livelist_condense_sync_cancel++; goto out; } dsl_deadlist_entry_t *first = spa->spa_to_condense.first; dsl_deadlist_entry_t *next = spa->spa_to_condense.next; dsl_deadlist_t *ll = &ds->ds_dir->dd_livelist; /* * It's possible that the livelist was changed while the zthr was * running. Therefore, we need to check for new blkptrs in the two * entries being condensed and continue to track them in the livelist. * Because of the way we handle remapped blkptrs (see dbuf_remap_impl), * it's possible that the newly added blkptrs are FREEs or ALLOCs so * we need to sort them into two different bplists. */ uint64_t first_obj = first->dle_bpobj.bpo_object; uint64_t next_obj = next->dle_bpobj.bpo_object; uint64_t cur_first_size = first->dle_bpobj.bpo_phys->bpo_num_blkptrs; uint64_t cur_next_size = next->dle_bpobj.bpo_phys->bpo_num_blkptrs; bplist_create(&new_frees); livelist_new_arg_t new_bps = { .allocs = &lca->to_keep, .frees = &new_frees, }; if (cur_first_size > lca->first_size) { VERIFY0(livelist_bpobj_iterate_from_nofree(&first->dle_bpobj, livelist_track_new_cb, &new_bps, lca->first_size)); } if (cur_next_size > lca->next_size) { VERIFY0(livelist_bpobj_iterate_from_nofree(&next->dle_bpobj, livelist_track_new_cb, &new_bps, lca->next_size)); } dsl_deadlist_clear_entry(first, ll, tx); ASSERT(bpobj_is_empty(&first->dle_bpobj)); dsl_deadlist_remove_entry(ll, next->dle_mintxg, tx); bplist_iterate(&lca->to_keep, dsl_deadlist_insert_alloc_cb, ll, tx); bplist_iterate(&new_frees, dsl_deadlist_insert_free_cb, ll, tx); bplist_destroy(&new_frees); char dsname[ZFS_MAX_DATASET_NAME_LEN]; dsl_dataset_name(ds, dsname); zfs_dbgmsg("txg %llu condensing livelist of %s (id %llu), bpobj %llu " "(%llu blkptrs) and bpobj %llu (%llu blkptrs) -> bpobj %llu " "(%llu blkptrs)", (u_longlong_t)tx->tx_txg, dsname, (u_longlong_t)ds->ds_object, (u_longlong_t)first_obj, (u_longlong_t)cur_first_size, (u_longlong_t)next_obj, (u_longlong_t)cur_next_size, (u_longlong_t)first->dle_bpobj.bpo_object, (u_longlong_t)first->dle_bpobj.bpo_phys->bpo_num_blkptrs); out: dmu_buf_rele(ds->ds_dbuf, spa); spa->spa_to_condense.ds = NULL; bplist_clear(&lca->to_keep); bplist_destroy(&lca->to_keep); kmem_free(lca, sizeof (livelist_condense_arg_t)); spa->spa_to_condense.syncing = B_FALSE; } static void spa_livelist_condense_cb(void *arg, zthr_t *t) { while (zfs_livelist_condense_zthr_pause && !(zthr_has_waiters(t) || zthr_iscancelled(t))) delay(1); spa_t *spa = arg; dsl_deadlist_entry_t *first = spa->spa_to_condense.first; dsl_deadlist_entry_t *next = spa->spa_to_condense.next; uint64_t first_size, next_size; livelist_condense_arg_t *lca = kmem_alloc(sizeof (livelist_condense_arg_t), KM_SLEEP); bplist_create(&lca->to_keep); /* * Process the livelists (matching FREEs and ALLOCs) in open context * so we have minimal work in syncing context to condense. * * We save bpobj sizes (first_size and next_size) to use later in * syncing context to determine if entries were added to these sublists * while in open context. This is possible because the clone is still * active and open for normal writes and we want to make sure the new, * unprocessed blockpointers are inserted into the livelist normally. * * Note that dsl_process_sub_livelist() both stores the size number of * blockpointers and iterates over them while the bpobj's lock held, so * the sizes returned to us are consistent which what was actually * processed. */ int err = dsl_process_sub_livelist(&first->dle_bpobj, &lca->to_keep, t, &first_size); if (err == 0) err = dsl_process_sub_livelist(&next->dle_bpobj, &lca->to_keep, t, &next_size); if (err == 0) { while (zfs_livelist_condense_sync_pause && !(zthr_has_waiters(t) || zthr_iscancelled(t))) delay(1); dmu_tx_t *tx = dmu_tx_create_dd(spa_get_dsl(spa)->dp_mos_dir); dmu_tx_mark_netfree(tx); dmu_tx_hold_space(tx, 1); err = dmu_tx_assign(tx, TXG_NOWAIT | TXG_NOTHROTTLE); if (err == 0) { /* * Prevent the condense zthr restarting before * the synctask completes. */ spa->spa_to_condense.syncing = B_TRUE; lca->spa = spa; lca->first_size = first_size; lca->next_size = next_size; dsl_sync_task_nowait(spa_get_dsl(spa), spa_livelist_condense_sync, lca, tx); dmu_tx_commit(tx); return; } } /* * Condensing can not continue: either it was externally stopped or * we were unable to assign to a tx because the pool has run out of * space. In the second case, we'll just end up trying to condense * again in a later txg. */ ASSERT(err != 0); bplist_clear(&lca->to_keep); bplist_destroy(&lca->to_keep); kmem_free(lca, sizeof (livelist_condense_arg_t)); dmu_buf_rele(spa->spa_to_condense.ds->ds_dbuf, spa); spa->spa_to_condense.ds = NULL; if (err == EINTR) zfs_livelist_condense_zthr_cancel++; } /* * Check that there is something to condense but that a condense is not * already in progress and that condensing has not been cancelled. */ static boolean_t spa_livelist_condense_cb_check(void *arg, zthr_t *z) { (void) z; spa_t *spa = arg; if ((spa->spa_to_condense.ds != NULL) && (spa->spa_to_condense.syncing == B_FALSE) && (spa->spa_to_condense.cancelled == B_FALSE)) { return (B_TRUE); } return (B_FALSE); } static void spa_start_livelist_condensing_thread(spa_t *spa) { spa->spa_to_condense.ds = NULL; spa->spa_to_condense.first = NULL; spa->spa_to_condense.next = NULL; spa->spa_to_condense.syncing = B_FALSE; spa->spa_to_condense.cancelled = B_FALSE; ASSERT3P(spa->spa_livelist_condense_zthr, ==, NULL); spa->spa_livelist_condense_zthr = zthr_create("z_livelist_condense", spa_livelist_condense_cb_check, spa_livelist_condense_cb, spa, minclsyspri); } static void spa_spawn_aux_threads(spa_t *spa) { ASSERT(spa_writeable(spa)); ASSERT(MUTEX_HELD(&spa_namespace_lock)); spa_start_indirect_condensing_thread(spa); spa_start_livelist_destroy_thread(spa); spa_start_livelist_condensing_thread(spa); ASSERT3P(spa->spa_checkpoint_discard_zthr, ==, NULL); spa->spa_checkpoint_discard_zthr = zthr_create("z_checkpoint_discard", spa_checkpoint_discard_thread_check, spa_checkpoint_discard_thread, spa, minclsyspri); } /* * Fix up config after a partly-completed split. This is done with the * ZPOOL_CONFIG_SPLIT nvlist. Both the splitting pool and the split-off * pool have that entry in their config, but only the splitting one contains * a list of all the guids of the vdevs that are being split off. * * This function determines what to do with that list: either rejoin * all the disks to the pool, or complete the splitting process. To attempt * the rejoin, each disk that is offlined is marked online again, and * we do a reopen() call. If the vdev label for every disk that was * marked online indicates it was successfully split off (VDEV_AUX_SPLIT_POOL) * then we call vdev_split() on each disk, and complete the split. * * Otherwise we leave the config alone, with all the vdevs in place in * the original pool. */ static void spa_try_repair(spa_t *spa, nvlist_t *config) { uint_t extracted; uint64_t *glist; uint_t i, gcount; nvlist_t *nvl; vdev_t **vd; boolean_t attempt_reopen; if (nvlist_lookup_nvlist(config, ZPOOL_CONFIG_SPLIT, &nvl) != 0) return; /* check that the config is complete */ if (nvlist_lookup_uint64_array(nvl, ZPOOL_CONFIG_SPLIT_LIST, &glist, &gcount) != 0) return; vd = kmem_zalloc(gcount * sizeof (vdev_t *), KM_SLEEP); /* attempt to online all the vdevs & validate */ attempt_reopen = B_TRUE; for (i = 0; i < gcount; i++) { if (glist[i] == 0) /* vdev is hole */ continue; vd[i] = spa_lookup_by_guid(spa, glist[i], B_FALSE); if (vd[i] == NULL) { /* * Don't bother attempting to reopen the disks; * just do the split. */ attempt_reopen = B_FALSE; } else { /* attempt to re-online it */ vd[i]->vdev_offline = B_FALSE; } } if (attempt_reopen) { vdev_reopen(spa->spa_root_vdev); /* check each device to see what state it's in */ for (extracted = 0, i = 0; i < gcount; i++) { if (vd[i] != NULL && vd[i]->vdev_stat.vs_aux != VDEV_AUX_SPLIT_POOL) break; ++extracted; } } /* * If every disk has been moved to the new pool, or if we never * even attempted to look at them, then we split them off for * good. */ if (!attempt_reopen || gcount == extracted) { for (i = 0; i < gcount; i++) if (vd[i] != NULL) vdev_split(vd[i]); vdev_reopen(spa->spa_root_vdev); } kmem_free(vd, gcount * sizeof (vdev_t *)); } static int spa_load(spa_t *spa, spa_load_state_t state, spa_import_type_t type) { const char *ereport = FM_EREPORT_ZFS_POOL; int error; spa->spa_load_state = state; (void) spa_import_progress_set_state(spa_guid(spa), spa_load_state(spa)); gethrestime(&spa->spa_loaded_ts); error = spa_load_impl(spa, type, &ereport); /* * Don't count references from objsets that are already closed * and are making their way through the eviction process. */ spa_evicting_os_wait(spa); spa->spa_minref = zfs_refcount_count(&spa->spa_refcount); if (error) { if (error != EEXIST) { spa->spa_loaded_ts.tv_sec = 0; spa->spa_loaded_ts.tv_nsec = 0; } if (error != EBADF) { (void) zfs_ereport_post(ereport, spa, NULL, NULL, NULL, 0); } } spa->spa_load_state = error ? SPA_LOAD_ERROR : SPA_LOAD_NONE; spa->spa_ena = 0; (void) spa_import_progress_set_state(spa_guid(spa), spa_load_state(spa)); return (error); } #ifdef ZFS_DEBUG /* * Count the number of per-vdev ZAPs associated with all of the vdevs in the * vdev tree rooted in the given vd, and ensure that each ZAP is present in the * spa's per-vdev ZAP list. */ static uint64_t vdev_count_verify_zaps(vdev_t *vd) { spa_t *spa = vd->vdev_spa; uint64_t total = 0; if (vd->vdev_top_zap != 0) { total++; ASSERT0(zap_lookup_int(spa->spa_meta_objset, spa->spa_all_vdev_zaps, vd->vdev_top_zap)); } if (vd->vdev_leaf_zap != 0) { total++; ASSERT0(zap_lookup_int(spa->spa_meta_objset, spa->spa_all_vdev_zaps, vd->vdev_leaf_zap)); } for (uint64_t i = 0; i < vd->vdev_children; i++) { total += vdev_count_verify_zaps(vd->vdev_child[i]); } return (total); } #else #define vdev_count_verify_zaps(vd) ((void) sizeof (vd), 0) #endif /* * Determine whether the activity check is required. */ static boolean_t spa_activity_check_required(spa_t *spa, uberblock_t *ub, nvlist_t *label, nvlist_t *config) { uint64_t state = 0; uint64_t hostid = 0; uint64_t tryconfig_txg = 0; uint64_t tryconfig_timestamp = 0; uint16_t tryconfig_mmp_seq = 0; nvlist_t *nvinfo; if (nvlist_exists(config, ZPOOL_CONFIG_LOAD_INFO)) { nvinfo = fnvlist_lookup_nvlist(config, ZPOOL_CONFIG_LOAD_INFO); (void) nvlist_lookup_uint64(nvinfo, ZPOOL_CONFIG_MMP_TXG, &tryconfig_txg); (void) nvlist_lookup_uint64(config, ZPOOL_CONFIG_TIMESTAMP, &tryconfig_timestamp); (void) nvlist_lookup_uint16(nvinfo, ZPOOL_CONFIG_MMP_SEQ, &tryconfig_mmp_seq); } (void) nvlist_lookup_uint64(config, ZPOOL_CONFIG_POOL_STATE, &state); /* * Disable the MMP activity check - This is used by zdb which * is intended to be used on potentially active pools. */ if (spa->spa_import_flags & ZFS_IMPORT_SKIP_MMP) return (B_FALSE); /* * Skip the activity check when the MMP feature is disabled. */ if (ub->ub_mmp_magic == MMP_MAGIC && ub->ub_mmp_delay == 0) return (B_FALSE); /* * If the tryconfig_ values are nonzero, they are the results of an * earlier tryimport. If they all match the uberblock we just found, * then the pool has not changed and we return false so we do not test * a second time. */ if (tryconfig_txg && tryconfig_txg == ub->ub_txg && tryconfig_timestamp && tryconfig_timestamp == ub->ub_timestamp && tryconfig_mmp_seq && tryconfig_mmp_seq == (MMP_SEQ_VALID(ub) ? MMP_SEQ(ub) : 0)) return (B_FALSE); /* * Allow the activity check to be skipped when importing the pool * on the same host which last imported it. Since the hostid from * configuration may be stale use the one read from the label. */ if (nvlist_exists(label, ZPOOL_CONFIG_HOSTID)) hostid = fnvlist_lookup_uint64(label, ZPOOL_CONFIG_HOSTID); if (hostid == spa_get_hostid(spa)) return (B_FALSE); /* * Skip the activity test when the pool was cleanly exported. */ if (state != POOL_STATE_ACTIVE) return (B_FALSE); return (B_TRUE); } /* * Nanoseconds the activity check must watch for changes on-disk. */ static uint64_t spa_activity_check_duration(spa_t *spa, uberblock_t *ub) { uint64_t import_intervals = MAX(zfs_multihost_import_intervals, 1); uint64_t multihost_interval = MSEC2NSEC( MMP_INTERVAL_OK(zfs_multihost_interval)); uint64_t import_delay = MAX(NANOSEC, import_intervals * multihost_interval); /* * Local tunables determine a minimum duration except for the case * where we know when the remote host will suspend the pool if MMP * writes do not land. * * See Big Theory comment at the top of mmp.c for the reasoning behind * these cases and times. */ ASSERT(MMP_IMPORT_SAFETY_FACTOR >= 100); if (MMP_INTERVAL_VALID(ub) && MMP_FAIL_INT_VALID(ub) && MMP_FAIL_INT(ub) > 0) { /* MMP on remote host will suspend pool after failed writes */ import_delay = MMP_FAIL_INT(ub) * MSEC2NSEC(MMP_INTERVAL(ub)) * MMP_IMPORT_SAFETY_FACTOR / 100; zfs_dbgmsg("fail_intvals>0 import_delay=%llu ub_mmp " "mmp_fails=%llu ub_mmp mmp_interval=%llu " "import_intervals=%llu", (u_longlong_t)import_delay, (u_longlong_t)MMP_FAIL_INT(ub), (u_longlong_t)MMP_INTERVAL(ub), (u_longlong_t)import_intervals); } else if (MMP_INTERVAL_VALID(ub) && MMP_FAIL_INT_VALID(ub) && MMP_FAIL_INT(ub) == 0) { /* MMP on remote host will never suspend pool */ import_delay = MAX(import_delay, (MSEC2NSEC(MMP_INTERVAL(ub)) + ub->ub_mmp_delay) * import_intervals); zfs_dbgmsg("fail_intvals=0 import_delay=%llu ub_mmp " "mmp_interval=%llu ub_mmp_delay=%llu " "import_intervals=%llu", (u_longlong_t)import_delay, (u_longlong_t)MMP_INTERVAL(ub), (u_longlong_t)ub->ub_mmp_delay, (u_longlong_t)import_intervals); } else if (MMP_VALID(ub)) { /* * zfs-0.7 compatibility case */ import_delay = MAX(import_delay, (multihost_interval + ub->ub_mmp_delay) * import_intervals); zfs_dbgmsg("import_delay=%llu ub_mmp_delay=%llu " "import_intervals=%llu leaves=%u", (u_longlong_t)import_delay, (u_longlong_t)ub->ub_mmp_delay, (u_longlong_t)import_intervals, vdev_count_leaves(spa)); } else { /* Using local tunings is the only reasonable option */ zfs_dbgmsg("pool last imported on non-MMP aware " "host using import_delay=%llu multihost_interval=%llu " "import_intervals=%llu", (u_longlong_t)import_delay, (u_longlong_t)multihost_interval, (u_longlong_t)import_intervals); } return (import_delay); } /* * Perform the import activity check. If the user canceled the import or * we detected activity then fail. */ static int spa_activity_check(spa_t *spa, uberblock_t *ub, nvlist_t *config) { uint64_t txg = ub->ub_txg; uint64_t timestamp = ub->ub_timestamp; uint64_t mmp_config = ub->ub_mmp_config; uint16_t mmp_seq = MMP_SEQ_VALID(ub) ? MMP_SEQ(ub) : 0; uint64_t import_delay; hrtime_t import_expire; nvlist_t *mmp_label = NULL; vdev_t *rvd = spa->spa_root_vdev; kcondvar_t cv; kmutex_t mtx; int error = 0; cv_init(&cv, NULL, CV_DEFAULT, NULL); mutex_init(&mtx, NULL, MUTEX_DEFAULT, NULL); mutex_enter(&mtx); /* * If ZPOOL_CONFIG_MMP_TXG is present an activity check was performed * during the earlier tryimport. If the txg recorded there is 0 then * the pool is known to be active on another host. * * Otherwise, the pool might be in use on another host. Check for * changes in the uberblocks on disk if necessary. */ if (nvlist_exists(config, ZPOOL_CONFIG_LOAD_INFO)) { nvlist_t *nvinfo = fnvlist_lookup_nvlist(config, ZPOOL_CONFIG_LOAD_INFO); if (nvlist_exists(nvinfo, ZPOOL_CONFIG_MMP_TXG) && fnvlist_lookup_uint64(nvinfo, ZPOOL_CONFIG_MMP_TXG) == 0) { vdev_uberblock_load(rvd, ub, &mmp_label); error = SET_ERROR(EREMOTEIO); goto out; } } import_delay = spa_activity_check_duration(spa, ub); /* Add a small random factor in case of simultaneous imports (0-25%) */ import_delay += import_delay * random_in_range(250) / 1000; import_expire = gethrtime() + import_delay; while (gethrtime() < import_expire) { (void) spa_import_progress_set_mmp_check(spa_guid(spa), NSEC2SEC(import_expire - gethrtime())); vdev_uberblock_load(rvd, ub, &mmp_label); if (txg != ub->ub_txg || timestamp != ub->ub_timestamp || mmp_seq != (MMP_SEQ_VALID(ub) ? MMP_SEQ(ub) : 0)) { zfs_dbgmsg("multihost activity detected " "txg %llu ub_txg %llu " "timestamp %llu ub_timestamp %llu " "mmp_config %#llx ub_mmp_config %#llx", (u_longlong_t)txg, (u_longlong_t)ub->ub_txg, (u_longlong_t)timestamp, (u_longlong_t)ub->ub_timestamp, (u_longlong_t)mmp_config, (u_longlong_t)ub->ub_mmp_config); error = SET_ERROR(EREMOTEIO); break; } if (mmp_label) { nvlist_free(mmp_label); mmp_label = NULL; } error = cv_timedwait_sig(&cv, &mtx, ddi_get_lbolt() + hz); if (error != -1) { error = SET_ERROR(EINTR); break; } error = 0; } out: mutex_exit(&mtx); mutex_destroy(&mtx); cv_destroy(&cv); /* * If the pool is determined to be active store the status in the * spa->spa_load_info nvlist. If the remote hostname or hostid are * available from configuration read from disk store them as well. * This allows 'zpool import' to generate a more useful message. * * ZPOOL_CONFIG_MMP_STATE - observed pool status (mandatory) * ZPOOL_CONFIG_MMP_HOSTNAME - hostname from the active pool * ZPOOL_CONFIG_MMP_HOSTID - hostid from the active pool */ if (error == EREMOTEIO) { const char *hostname = ""; uint64_t hostid = 0; if (mmp_label) { if (nvlist_exists(mmp_label, ZPOOL_CONFIG_HOSTNAME)) { hostname = fnvlist_lookup_string(mmp_label, ZPOOL_CONFIG_HOSTNAME); fnvlist_add_string(spa->spa_load_info, ZPOOL_CONFIG_MMP_HOSTNAME, hostname); } if (nvlist_exists(mmp_label, ZPOOL_CONFIG_HOSTID)) { hostid = fnvlist_lookup_uint64(mmp_label, ZPOOL_CONFIG_HOSTID); fnvlist_add_uint64(spa->spa_load_info, ZPOOL_CONFIG_MMP_HOSTID, hostid); } } fnvlist_add_uint64(spa->spa_load_info, ZPOOL_CONFIG_MMP_STATE, MMP_STATE_ACTIVE); fnvlist_add_uint64(spa->spa_load_info, ZPOOL_CONFIG_MMP_TXG, 0); error = spa_vdev_err(rvd, VDEV_AUX_ACTIVE, EREMOTEIO); } if (mmp_label) nvlist_free(mmp_label); return (error); } static int spa_verify_host(spa_t *spa, nvlist_t *mos_config) { uint64_t hostid; char *hostname; uint64_t myhostid = 0; if (!spa_is_root(spa) && nvlist_lookup_uint64(mos_config, ZPOOL_CONFIG_HOSTID, &hostid) == 0) { hostname = fnvlist_lookup_string(mos_config, ZPOOL_CONFIG_HOSTNAME); myhostid = zone_get_hostid(NULL); if (hostid != 0 && myhostid != 0 && hostid != myhostid) { cmn_err(CE_WARN, "pool '%s' could not be " "loaded as it was last accessed by " "another system (host: %s hostid: 0x%llx). " "See: https://openzfs.github.io/openzfs-docs/msg/" "ZFS-8000-EY", spa_name(spa), hostname, (u_longlong_t)hostid); spa_load_failed(spa, "hostid verification failed: pool " "last accessed by host: %s (hostid: 0x%llx)", hostname, (u_longlong_t)hostid); return (SET_ERROR(EBADF)); } } return (0); } static int spa_ld_parse_config(spa_t *spa, spa_import_type_t type) { int error = 0; nvlist_t *nvtree, *nvl, *config = spa->spa_config; int parse; vdev_t *rvd; uint64_t pool_guid; char *comment; char *compatibility; /* * Versioning wasn't explicitly added to the label until later, so if * it's not present treat it as the initial version. */ if (nvlist_lookup_uint64(config, ZPOOL_CONFIG_VERSION, &spa->spa_ubsync.ub_version) != 0) spa->spa_ubsync.ub_version = SPA_VERSION_INITIAL; if (nvlist_lookup_uint64(config, ZPOOL_CONFIG_POOL_GUID, &pool_guid)) { spa_load_failed(spa, "invalid config provided: '%s' missing", ZPOOL_CONFIG_POOL_GUID); return (SET_ERROR(EINVAL)); } /* * If we are doing an import, ensure that the pool is not already * imported by checking if its pool guid already exists in the * spa namespace. * * The only case that we allow an already imported pool to be * imported again, is when the pool is checkpointed and we want to * look at its checkpointed state from userland tools like zdb. */ #ifdef _KERNEL if ((spa->spa_load_state == SPA_LOAD_IMPORT || spa->spa_load_state == SPA_LOAD_TRYIMPORT) && spa_guid_exists(pool_guid, 0)) { #else if ((spa->spa_load_state == SPA_LOAD_IMPORT || spa->spa_load_state == SPA_LOAD_TRYIMPORT) && spa_guid_exists(pool_guid, 0) && !spa_importing_readonly_checkpoint(spa)) { #endif spa_load_failed(spa, "a pool with guid %llu is already open", (u_longlong_t)pool_guid); return (SET_ERROR(EEXIST)); } spa->spa_config_guid = pool_guid; nvlist_free(spa->spa_load_info); spa->spa_load_info = fnvlist_alloc(); ASSERT(spa->spa_comment == NULL); if (nvlist_lookup_string(config, ZPOOL_CONFIG_COMMENT, &comment) == 0) spa->spa_comment = spa_strdup(comment); ASSERT(spa->spa_compatibility == NULL); if (nvlist_lookup_string(config, ZPOOL_CONFIG_COMPATIBILITY, &compatibility) == 0) spa->spa_compatibility = spa_strdup(compatibility); (void) nvlist_lookup_uint64(config, ZPOOL_CONFIG_POOL_TXG, &spa->spa_config_txg); if (nvlist_lookup_nvlist(config, ZPOOL_CONFIG_SPLIT, &nvl) == 0) spa->spa_config_splitting = fnvlist_dup(nvl); if (nvlist_lookup_nvlist(config, ZPOOL_CONFIG_VDEV_TREE, &nvtree)) { spa_load_failed(spa, "invalid config provided: '%s' missing", ZPOOL_CONFIG_VDEV_TREE); return (SET_ERROR(EINVAL)); } /* * Create "The Godfather" zio to hold all async IOs */ spa->spa_async_zio_root = kmem_alloc(max_ncpus * sizeof (void *), KM_SLEEP); for (int i = 0; i < max_ncpus; i++) { spa->spa_async_zio_root[i] = zio_root(spa, NULL, NULL, ZIO_FLAG_CANFAIL | ZIO_FLAG_SPECULATIVE | ZIO_FLAG_GODFATHER); } /* * Parse the configuration into a vdev tree. We explicitly set the * value that will be returned by spa_version() since parsing the * configuration requires knowing the version number. */ spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER); parse = (type == SPA_IMPORT_EXISTING ? VDEV_ALLOC_LOAD : VDEV_ALLOC_SPLIT); error = spa_config_parse(spa, &rvd, nvtree, NULL, 0, parse); spa_config_exit(spa, SCL_ALL, FTAG); if (error != 0) { spa_load_failed(spa, "unable to parse config [error=%d]", error); return (error); } ASSERT(spa->spa_root_vdev == rvd); ASSERT3U(spa->spa_min_ashift, >=, SPA_MINBLOCKSHIFT); ASSERT3U(spa->spa_max_ashift, <=, SPA_MAXBLOCKSHIFT); if (type != SPA_IMPORT_ASSEMBLE) { ASSERT(spa_guid(spa) == pool_guid); } return (0); } /* * Recursively open all vdevs in the vdev tree. This function is called twice: * first with the untrusted config, then with the trusted config. */ static int spa_ld_open_vdevs(spa_t *spa) { int error = 0; /* * spa_missing_tvds_allowed defines how many top-level vdevs can be * missing/unopenable for the root vdev to be still considered openable. */ if (spa->spa_trust_config) { spa->spa_missing_tvds_allowed = zfs_max_missing_tvds; } else if (spa->spa_config_source == SPA_CONFIG_SRC_CACHEFILE) { spa->spa_missing_tvds_allowed = zfs_max_missing_tvds_cachefile; } else if (spa->spa_config_source == SPA_CONFIG_SRC_SCAN) { spa->spa_missing_tvds_allowed = zfs_max_missing_tvds_scan; } else { spa->spa_missing_tvds_allowed = 0; } spa->spa_missing_tvds_allowed = MAX(zfs_max_missing_tvds, spa->spa_missing_tvds_allowed); spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER); error = vdev_open(spa->spa_root_vdev); spa_config_exit(spa, SCL_ALL, FTAG); if (spa->spa_missing_tvds != 0) { spa_load_note(spa, "vdev tree has %lld missing top-level " "vdevs.", (u_longlong_t)spa->spa_missing_tvds); if (spa->spa_trust_config && (spa->spa_mode & SPA_MODE_WRITE)) { /* * Although theoretically we could allow users to open * incomplete pools in RW mode, we'd need to add a lot * of extra logic (e.g. adjust pool space to account * for missing vdevs). * This limitation also prevents users from accidentally * opening the pool in RW mode during data recovery and * damaging it further. */ spa_load_note(spa, "pools with missing top-level " "vdevs can only be opened in read-only mode."); error = SET_ERROR(ENXIO); } else { spa_load_note(spa, "current settings allow for maximum " "%lld missing top-level vdevs at this stage.", (u_longlong_t)spa->spa_missing_tvds_allowed); } } if (error != 0) { spa_load_failed(spa, "unable to open vdev tree [error=%d]", error); } if (spa->spa_missing_tvds != 0 || error != 0) vdev_dbgmsg_print_tree(spa->spa_root_vdev, 2); return (error); } /* * We need to validate the vdev labels against the configuration that * we have in hand. This function is called twice: first with an untrusted * config, then with a trusted config. The validation is more strict when the * config is trusted. */ static int spa_ld_validate_vdevs(spa_t *spa) { int error = 0; vdev_t *rvd = spa->spa_root_vdev; spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER); error = vdev_validate(rvd); spa_config_exit(spa, SCL_ALL, FTAG); if (error != 0) { spa_load_failed(spa, "vdev_validate failed [error=%d]", error); return (error); } if (rvd->vdev_state <= VDEV_STATE_CANT_OPEN) { spa_load_failed(spa, "cannot open vdev tree after invalidating " "some vdevs"); vdev_dbgmsg_print_tree(rvd, 2); return (SET_ERROR(ENXIO)); } return (0); } static void spa_ld_select_uberblock_done(spa_t *spa, uberblock_t *ub) { spa->spa_state = POOL_STATE_ACTIVE; spa->spa_ubsync = spa->spa_uberblock; spa->spa_verify_min_txg = spa->spa_extreme_rewind ? TXG_INITIAL - 1 : spa_last_synced_txg(spa) - TXG_DEFER_SIZE - 1; spa->spa_first_txg = spa->spa_last_ubsync_txg ? spa->spa_last_ubsync_txg : spa_last_synced_txg(spa) + 1; spa->spa_claim_max_txg = spa->spa_first_txg; spa->spa_prev_software_version = ub->ub_software_version; } static int spa_ld_select_uberblock(spa_t *spa, spa_import_type_t type) { vdev_t *rvd = spa->spa_root_vdev; nvlist_t *label; uberblock_t *ub = &spa->spa_uberblock; boolean_t activity_check = B_FALSE; /* * If we are opening the checkpointed state of the pool by * rewinding to it, at this point we will have written the * checkpointed uberblock to the vdev labels, so searching * the labels will find the right uberblock. However, if * we are opening the checkpointed state read-only, we have * not modified the labels. Therefore, we must ignore the * labels and continue using the spa_uberblock that was set * by spa_ld_checkpoint_rewind. * * Note that it would be fine to ignore the labels when * rewinding (opening writeable) as well. However, if we * crash just after writing the labels, we will end up * searching the labels. Doing so in the common case means * that this code path gets exercised normally, rather than * just in the edge case. */ if (ub->ub_checkpoint_txg != 0 && spa_importing_readonly_checkpoint(spa)) { spa_ld_select_uberblock_done(spa, ub); return (0); } /* * Find the best uberblock. */ vdev_uberblock_load(rvd, ub, &label); /* * If we weren't able to find a single valid uberblock, return failure. */ if (ub->ub_txg == 0) { nvlist_free(label); spa_load_failed(spa, "no valid uberblock found"); return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, ENXIO)); } if (spa->spa_load_max_txg != UINT64_MAX) { (void) spa_import_progress_set_max_txg(spa_guid(spa), (u_longlong_t)spa->spa_load_max_txg); } spa_load_note(spa, "using uberblock with txg=%llu", (u_longlong_t)ub->ub_txg); /* * For pools which have the multihost property on determine if the * pool is truly inactive and can be safely imported. Prevent * hosts which don't have a hostid set from importing the pool. */ activity_check = spa_activity_check_required(spa, ub, label, spa->spa_config); if (activity_check) { if (ub->ub_mmp_magic == MMP_MAGIC && ub->ub_mmp_delay && spa_get_hostid(spa) == 0) { nvlist_free(label); fnvlist_add_uint64(spa->spa_load_info, ZPOOL_CONFIG_MMP_STATE, MMP_STATE_NO_HOSTID); return (spa_vdev_err(rvd, VDEV_AUX_ACTIVE, EREMOTEIO)); } int error = spa_activity_check(spa, ub, spa->spa_config); if (error) { nvlist_free(label); return (error); } fnvlist_add_uint64(spa->spa_load_info, ZPOOL_CONFIG_MMP_STATE, MMP_STATE_INACTIVE); fnvlist_add_uint64(spa->spa_load_info, ZPOOL_CONFIG_MMP_TXG, ub->ub_txg); fnvlist_add_uint16(spa->spa_load_info, ZPOOL_CONFIG_MMP_SEQ, (MMP_SEQ_VALID(ub) ? MMP_SEQ(ub) : 0)); } /* * If the pool has an unsupported version we can't open it. */ if (!SPA_VERSION_IS_SUPPORTED(ub->ub_version)) { nvlist_free(label); spa_load_failed(spa, "version %llu is not supported", (u_longlong_t)ub->ub_version); return (spa_vdev_err(rvd, VDEV_AUX_VERSION_NEWER, ENOTSUP)); } if (ub->ub_version >= SPA_VERSION_FEATURES) { nvlist_t *features; /* * If we weren't able to find what's necessary for reading the * MOS in the label, return failure. */ if (label == NULL) { spa_load_failed(spa, "label config unavailable"); return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, ENXIO)); } if (nvlist_lookup_nvlist(label, ZPOOL_CONFIG_FEATURES_FOR_READ, &features) != 0) { nvlist_free(label); spa_load_failed(spa, "invalid label: '%s' missing", ZPOOL_CONFIG_FEATURES_FOR_READ); return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, ENXIO)); } /* * Update our in-core representation with the definitive values * from the label. */ nvlist_free(spa->spa_label_features); spa->spa_label_features = fnvlist_dup(features); } nvlist_free(label); /* * Look through entries in the label nvlist's features_for_read. If * there is a feature listed there which we don't understand then we * cannot open a pool. */ if (ub->ub_version >= SPA_VERSION_FEATURES) { nvlist_t *unsup_feat; unsup_feat = fnvlist_alloc(); for (nvpair_t *nvp = nvlist_next_nvpair(spa->spa_label_features, NULL); nvp != NULL; nvp = nvlist_next_nvpair(spa->spa_label_features, nvp)) { if (!zfeature_is_supported(nvpair_name(nvp))) { fnvlist_add_string(unsup_feat, nvpair_name(nvp), ""); } } if (!nvlist_empty(unsup_feat)) { fnvlist_add_nvlist(spa->spa_load_info, ZPOOL_CONFIG_UNSUP_FEAT, unsup_feat); nvlist_free(unsup_feat); spa_load_failed(spa, "some features are unsupported"); return (spa_vdev_err(rvd, VDEV_AUX_UNSUP_FEAT, ENOTSUP)); } nvlist_free(unsup_feat); } if (type != SPA_IMPORT_ASSEMBLE && spa->spa_config_splitting) { spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER); spa_try_repair(spa, spa->spa_config); spa_config_exit(spa, SCL_ALL, FTAG); nvlist_free(spa->spa_config_splitting); spa->spa_config_splitting = NULL; } /* * Initialize internal SPA structures. */ spa_ld_select_uberblock_done(spa, ub); return (0); } static int spa_ld_open_rootbp(spa_t *spa) { int error = 0; vdev_t *rvd = spa->spa_root_vdev; error = dsl_pool_init(spa, spa->spa_first_txg, &spa->spa_dsl_pool); if (error != 0) { spa_load_failed(spa, "unable to open rootbp in dsl_pool_init " "[error=%d]", error); return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO)); } spa->spa_meta_objset = spa->spa_dsl_pool->dp_meta_objset; return (0); } static int spa_ld_trusted_config(spa_t *spa, spa_import_type_t type, boolean_t reloading) { vdev_t *mrvd, *rvd = spa->spa_root_vdev; nvlist_t *nv, *mos_config, *policy; int error = 0, copy_error; uint64_t healthy_tvds, healthy_tvds_mos; uint64_t mos_config_txg; if (spa_dir_prop(spa, DMU_POOL_CONFIG, &spa->spa_config_object, B_TRUE) != 0) return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO)); /* * If we're assembling a pool from a split, the config provided is * already trusted so there is nothing to do. */ if (type == SPA_IMPORT_ASSEMBLE) return (0); healthy_tvds = spa_healthy_core_tvds(spa); if (load_nvlist(spa, spa->spa_config_object, &mos_config) != 0) { spa_load_failed(spa, "unable to retrieve MOS config"); return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO)); } /* * If we are doing an open, pool owner wasn't verified yet, thus do * the verification here. */ if (spa->spa_load_state == SPA_LOAD_OPEN) { error = spa_verify_host(spa, mos_config); if (error != 0) { nvlist_free(mos_config); return (error); } } nv = fnvlist_lookup_nvlist(mos_config, ZPOOL_CONFIG_VDEV_TREE); spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER); /* * Build a new vdev tree from the trusted config */ error = spa_config_parse(spa, &mrvd, nv, NULL, 0, VDEV_ALLOC_LOAD); if (error != 0) { nvlist_free(mos_config); spa_config_exit(spa, SCL_ALL, FTAG); spa_load_failed(spa, "spa_config_parse failed [error=%d]", error); return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, error)); } /* * Vdev paths in the MOS may be obsolete. If the untrusted config was * obtained by scanning /dev/dsk, then it will have the right vdev * paths. We update the trusted MOS config with this information. * We first try to copy the paths with vdev_copy_path_strict, which * succeeds only when both configs have exactly the same vdev tree. * If that fails, we fall back to a more flexible method that has a * best effort policy. */ copy_error = vdev_copy_path_strict(rvd, mrvd); if (copy_error != 0 || spa_load_print_vdev_tree) { spa_load_note(spa, "provided vdev tree:"); vdev_dbgmsg_print_tree(rvd, 2); spa_load_note(spa, "MOS vdev tree:"); vdev_dbgmsg_print_tree(mrvd, 2); } if (copy_error != 0) { spa_load_note(spa, "vdev_copy_path_strict failed, falling " "back to vdev_copy_path_relaxed"); vdev_copy_path_relaxed(rvd, mrvd); } vdev_close(rvd); vdev_free(rvd); spa->spa_root_vdev = mrvd; rvd = mrvd; spa_config_exit(spa, SCL_ALL, FTAG); /* * We will use spa_config if we decide to reload the spa or if spa_load * fails and we rewind. We must thus regenerate the config using the * MOS information with the updated paths. ZPOOL_LOAD_POLICY is used to * pass settings on how to load the pool and is not stored in the MOS. * We copy it over to our new, trusted config. */ mos_config_txg = fnvlist_lookup_uint64(mos_config, ZPOOL_CONFIG_POOL_TXG); nvlist_free(mos_config); mos_config = spa_config_generate(spa, NULL, mos_config_txg, B_FALSE); if (nvlist_lookup_nvlist(spa->spa_config, ZPOOL_LOAD_POLICY, &policy) == 0) fnvlist_add_nvlist(mos_config, ZPOOL_LOAD_POLICY, policy); spa_config_set(spa, mos_config); spa->spa_config_source = SPA_CONFIG_SRC_MOS; /* * Now that we got the config from the MOS, we should be more strict * in checking blkptrs and can make assumptions about the consistency * of the vdev tree. spa_trust_config must be set to true before opening * vdevs in order for them to be writeable. */ spa->spa_trust_config = B_TRUE; /* * Open and validate the new vdev tree */ error = spa_ld_open_vdevs(spa); if (error != 0) return (error); error = spa_ld_validate_vdevs(spa); if (error != 0) return (error); if (copy_error != 0 || spa_load_print_vdev_tree) { spa_load_note(spa, "final vdev tree:"); vdev_dbgmsg_print_tree(rvd, 2); } if (spa->spa_load_state != SPA_LOAD_TRYIMPORT && !spa->spa_extreme_rewind && zfs_max_missing_tvds == 0) { /* * Sanity check to make sure that we are indeed loading the * latest uberblock. If we missed SPA_SYNC_MIN_VDEVS tvds * in the config provided and they happened to be the only ones * to have the latest uberblock, we could involuntarily perform * an extreme rewind. */ healthy_tvds_mos = spa_healthy_core_tvds(spa); if (healthy_tvds_mos - healthy_tvds >= SPA_SYNC_MIN_VDEVS) { spa_load_note(spa, "config provided misses too many " "top-level vdevs compared to MOS (%lld vs %lld). ", (u_longlong_t)healthy_tvds, (u_longlong_t)healthy_tvds_mos); spa_load_note(spa, "vdev tree:"); vdev_dbgmsg_print_tree(rvd, 2); if (reloading) { spa_load_failed(spa, "config was already " "provided from MOS. Aborting."); return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO)); } spa_load_note(spa, "spa must be reloaded using MOS " "config"); return (SET_ERROR(EAGAIN)); } } error = spa_check_for_missing_logs(spa); if (error != 0) return (spa_vdev_err(rvd, VDEV_AUX_BAD_GUID_SUM, ENXIO)); if (rvd->vdev_guid_sum != spa->spa_uberblock.ub_guid_sum) { spa_load_failed(spa, "uberblock guid sum doesn't match MOS " "guid sum (%llu != %llu)", (u_longlong_t)spa->spa_uberblock.ub_guid_sum, (u_longlong_t)rvd->vdev_guid_sum); return (spa_vdev_err(rvd, VDEV_AUX_BAD_GUID_SUM, ENXIO)); } return (0); } static int spa_ld_open_indirect_vdev_metadata(spa_t *spa) { int error = 0; vdev_t *rvd = spa->spa_root_vdev; /* * Everything that we read before spa_remove_init() must be stored * on concreted vdevs. Therefore we do this as early as possible. */ error = spa_remove_init(spa); if (error != 0) { spa_load_failed(spa, "spa_remove_init failed [error=%d]", error); return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO)); } /* * Retrieve information needed to condense indirect vdev mappings. */ error = spa_condense_init(spa); if (error != 0) { spa_load_failed(spa, "spa_condense_init failed [error=%d]", error); return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, error)); } return (0); } static int spa_ld_check_features(spa_t *spa, boolean_t *missing_feat_writep) { int error = 0; vdev_t *rvd = spa->spa_root_vdev; if (spa_version(spa) >= SPA_VERSION_FEATURES) { boolean_t missing_feat_read = B_FALSE; nvlist_t *unsup_feat, *enabled_feat; if (spa_dir_prop(spa, DMU_POOL_FEATURES_FOR_READ, &spa->spa_feat_for_read_obj, B_TRUE) != 0) { return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO)); } if (spa_dir_prop(spa, DMU_POOL_FEATURES_FOR_WRITE, &spa->spa_feat_for_write_obj, B_TRUE) != 0) { return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO)); } if (spa_dir_prop(spa, DMU_POOL_FEATURE_DESCRIPTIONS, &spa->spa_feat_desc_obj, B_TRUE) != 0) { return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO)); } enabled_feat = fnvlist_alloc(); unsup_feat = fnvlist_alloc(); if (!spa_features_check(spa, B_FALSE, unsup_feat, enabled_feat)) missing_feat_read = B_TRUE; if (spa_writeable(spa) || spa->spa_load_state == SPA_LOAD_TRYIMPORT) { if (!spa_features_check(spa, B_TRUE, unsup_feat, enabled_feat)) { *missing_feat_writep = B_TRUE; } } fnvlist_add_nvlist(spa->spa_load_info, ZPOOL_CONFIG_ENABLED_FEAT, enabled_feat); if (!nvlist_empty(unsup_feat)) { fnvlist_add_nvlist(spa->spa_load_info, ZPOOL_CONFIG_UNSUP_FEAT, unsup_feat); } fnvlist_free(enabled_feat); fnvlist_free(unsup_feat); if (!missing_feat_read) { fnvlist_add_boolean(spa->spa_load_info, ZPOOL_CONFIG_CAN_RDONLY); } /* * If the state is SPA_LOAD_TRYIMPORT, our objective is * twofold: to determine whether the pool is available for * import in read-write mode and (if it is not) whether the * pool is available for import in read-only mode. If the pool * is available for import in read-write mode, it is displayed * as available in userland; if it is not available for import * in read-only mode, it is displayed as unavailable in * userland. If the pool is available for import in read-only * mode but not read-write mode, it is displayed as unavailable * in userland with a special note that the pool is actually * available for open in read-only mode. * * As a result, if the state is SPA_LOAD_TRYIMPORT and we are * missing a feature for write, we must first determine whether * the pool can be opened read-only before returning to * userland in order to know whether to display the * abovementioned note. */ if (missing_feat_read || (*missing_feat_writep && spa_writeable(spa))) { spa_load_failed(spa, "pool uses unsupported features"); return (spa_vdev_err(rvd, VDEV_AUX_UNSUP_FEAT, ENOTSUP)); } /* * Load refcounts for ZFS features from disk into an in-memory * cache during SPA initialization. */ for (spa_feature_t i = 0; i < SPA_FEATURES; i++) { uint64_t refcount; error = feature_get_refcount_from_disk(spa, &spa_feature_table[i], &refcount); if (error == 0) { spa->spa_feat_refcount_cache[i] = refcount; } else if (error == ENOTSUP) { spa->spa_feat_refcount_cache[i] = SPA_FEATURE_DISABLED; } else { spa_load_failed(spa, "error getting refcount " "for feature %s [error=%d]", spa_feature_table[i].fi_guid, error); return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO)); } } } if (spa_feature_is_active(spa, SPA_FEATURE_ENABLED_TXG)) { if (spa_dir_prop(spa, DMU_POOL_FEATURE_ENABLED_TXG, &spa->spa_feat_enabled_txg_obj, B_TRUE) != 0) return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO)); } /* * Encryption was added before bookmark_v2, even though bookmark_v2 * is now a dependency. If this pool has encryption enabled without * bookmark_v2, trigger an errata message. */ if (spa_feature_is_enabled(spa, SPA_FEATURE_ENCRYPTION) && !spa_feature_is_enabled(spa, SPA_FEATURE_BOOKMARK_V2)) { spa->spa_errata = ZPOOL_ERRATA_ZOL_8308_ENCRYPTION; } return (0); } static int spa_ld_load_special_directories(spa_t *spa) { int error = 0; vdev_t *rvd = spa->spa_root_vdev; spa->spa_is_initializing = B_TRUE; error = dsl_pool_open(spa->spa_dsl_pool); spa->spa_is_initializing = B_FALSE; if (error != 0) { spa_load_failed(spa, "dsl_pool_open failed [error=%d]", error); return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO)); } return (0); } static int spa_ld_get_props(spa_t *spa) { int error = 0; uint64_t obj; vdev_t *rvd = spa->spa_root_vdev; /* Grab the checksum salt from the MOS. */ error = zap_lookup(spa->spa_meta_objset, DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_CHECKSUM_SALT, 1, sizeof (spa->spa_cksum_salt.zcs_bytes), spa->spa_cksum_salt.zcs_bytes); if (error == ENOENT) { /* Generate a new salt for subsequent use */ (void) random_get_pseudo_bytes(spa->spa_cksum_salt.zcs_bytes, sizeof (spa->spa_cksum_salt.zcs_bytes)); } else if (error != 0) { spa_load_failed(spa, "unable to retrieve checksum salt from " "MOS [error=%d]", error); return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO)); } if (spa_dir_prop(spa, DMU_POOL_SYNC_BPOBJ, &obj, B_TRUE) != 0) return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO)); error = bpobj_open(&spa->spa_deferred_bpobj, spa->spa_meta_objset, obj); if (error != 0) { spa_load_failed(spa, "error opening deferred-frees bpobj " "[error=%d]", error); return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO)); } /* * Load the bit that tells us to use the new accounting function * (raid-z deflation). If we have an older pool, this will not * be present. */ error = spa_dir_prop(spa, DMU_POOL_DEFLATE, &spa->spa_deflate, B_FALSE); if (error != 0 && error != ENOENT) return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO)); error = spa_dir_prop(spa, DMU_POOL_CREATION_VERSION, &spa->spa_creation_version, B_FALSE); if (error != 0 && error != ENOENT) return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO)); /* * Load the persistent error log. If we have an older pool, this will * not be present. */ error = spa_dir_prop(spa, DMU_POOL_ERRLOG_LAST, &spa->spa_errlog_last, B_FALSE); if (error != 0 && error != ENOENT) return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO)); error = spa_dir_prop(spa, DMU_POOL_ERRLOG_SCRUB, &spa->spa_errlog_scrub, B_FALSE); if (error != 0 && error != ENOENT) return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO)); /* * Load the livelist deletion field. If a livelist is queued for * deletion, indicate that in the spa */ error = spa_dir_prop(spa, DMU_POOL_DELETED_CLONES, &spa->spa_livelists_to_delete, B_FALSE); if (error != 0 && error != ENOENT) return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO)); /* * Load the history object. If we have an older pool, this * will not be present. */ error = spa_dir_prop(spa, DMU_POOL_HISTORY, &spa->spa_history, B_FALSE); if (error != 0 && error != ENOENT) return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO)); /* * Load the per-vdev ZAP map. If we have an older pool, this will not * be present; in this case, defer its creation to a later time to * avoid dirtying the MOS this early / out of sync context. See * spa_sync_config_object. */ /* The sentinel is only available in the MOS config. */ nvlist_t *mos_config; if (load_nvlist(spa, spa->spa_config_object, &mos_config) != 0) { spa_load_failed(spa, "unable to retrieve MOS config"); return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO)); } error = spa_dir_prop(spa, DMU_POOL_VDEV_ZAP_MAP, &spa->spa_all_vdev_zaps, B_FALSE); if (error == ENOENT) { VERIFY(!nvlist_exists(mos_config, ZPOOL_CONFIG_HAS_PER_VDEV_ZAPS)); spa->spa_avz_action = AVZ_ACTION_INITIALIZE; ASSERT0(vdev_count_verify_zaps(spa->spa_root_vdev)); } else if (error != 0) { nvlist_free(mos_config); return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO)); } else if (!nvlist_exists(mos_config, ZPOOL_CONFIG_HAS_PER_VDEV_ZAPS)) { /* * An older version of ZFS overwrote the sentinel value, so * we have orphaned per-vdev ZAPs in the MOS. Defer their * destruction to later; see spa_sync_config_object. */ spa->spa_avz_action = AVZ_ACTION_DESTROY; /* * We're assuming that no vdevs have had their ZAPs created * before this. Better be sure of it. */ ASSERT0(vdev_count_verify_zaps(spa->spa_root_vdev)); } nvlist_free(mos_config); spa->spa_delegation = zpool_prop_default_numeric(ZPOOL_PROP_DELEGATION); error = spa_dir_prop(spa, DMU_POOL_PROPS, &spa->spa_pool_props_object, B_FALSE); if (error && error != ENOENT) return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO)); if (error == 0) { uint64_t autoreplace = 0; spa_prop_find(spa, ZPOOL_PROP_BOOTFS, &spa->spa_bootfs); spa_prop_find(spa, ZPOOL_PROP_AUTOREPLACE, &autoreplace); spa_prop_find(spa, ZPOOL_PROP_DELEGATION, &spa->spa_delegation); spa_prop_find(spa, ZPOOL_PROP_FAILUREMODE, &spa->spa_failmode); spa_prop_find(spa, ZPOOL_PROP_AUTOEXPAND, &spa->spa_autoexpand); spa_prop_find(spa, ZPOOL_PROP_MULTIHOST, &spa->spa_multihost); spa_prop_find(spa, ZPOOL_PROP_AUTOTRIM, &spa->spa_autotrim); spa->spa_autoreplace = (autoreplace != 0); } /* * If we are importing a pool with missing top-level vdevs, * we enforce that the pool doesn't panic or get suspended on * error since the likelihood of missing data is extremely high. */ if (spa->spa_missing_tvds > 0 && spa->spa_failmode != ZIO_FAILURE_MODE_CONTINUE && spa->spa_load_state != SPA_LOAD_TRYIMPORT) { spa_load_note(spa, "forcing failmode to 'continue' " "as some top level vdevs are missing"); spa->spa_failmode = ZIO_FAILURE_MODE_CONTINUE; } return (0); } static int spa_ld_open_aux_vdevs(spa_t *spa, spa_import_type_t type) { int error = 0; vdev_t *rvd = spa->spa_root_vdev; /* * If we're assembling the pool from the split-off vdevs of * an existing pool, we don't want to attach the spares & cache * devices. */ /* * Load any hot spares for this pool. */ error = spa_dir_prop(spa, DMU_POOL_SPARES, &spa->spa_spares.sav_object, B_FALSE); if (error != 0 && error != ENOENT) return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO)); if (error == 0 && type != SPA_IMPORT_ASSEMBLE) { ASSERT(spa_version(spa) >= SPA_VERSION_SPARES); if (load_nvlist(spa, spa->spa_spares.sav_object, &spa->spa_spares.sav_config) != 0) { spa_load_failed(spa, "error loading spares nvlist"); return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO)); } spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER); spa_load_spares(spa); spa_config_exit(spa, SCL_ALL, FTAG); } else if (error == 0) { spa->spa_spares.sav_sync = B_TRUE; } /* * Load any level 2 ARC devices for this pool. */ error = spa_dir_prop(spa, DMU_POOL_L2CACHE, &spa->spa_l2cache.sav_object, B_FALSE); if (error != 0 && error != ENOENT) return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO)); if (error == 0 && type != SPA_IMPORT_ASSEMBLE) { ASSERT(spa_version(spa) >= SPA_VERSION_L2CACHE); if (load_nvlist(spa, spa->spa_l2cache.sav_object, &spa->spa_l2cache.sav_config) != 0) { spa_load_failed(spa, "error loading l2cache nvlist"); return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO)); } spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER); spa_load_l2cache(spa); spa_config_exit(spa, SCL_ALL, FTAG); } else if (error == 0) { spa->spa_l2cache.sav_sync = B_TRUE; } return (0); } static int spa_ld_load_vdev_metadata(spa_t *spa) { int error = 0; vdev_t *rvd = spa->spa_root_vdev; /* * If the 'multihost' property is set, then never allow a pool to * be imported when the system hostid is zero. The exception to * this rule is zdb which is always allowed to access pools. */ if (spa_multihost(spa) && spa_get_hostid(spa) == 0 && (spa->spa_import_flags & ZFS_IMPORT_SKIP_MMP) == 0) { fnvlist_add_uint64(spa->spa_load_info, ZPOOL_CONFIG_MMP_STATE, MMP_STATE_NO_HOSTID); return (spa_vdev_err(rvd, VDEV_AUX_ACTIVE, EREMOTEIO)); } /* * If the 'autoreplace' property is set, then post a resource notifying * the ZFS DE that it should not issue any faults for unopenable * devices. We also iterate over the vdevs, and post a sysevent for any * unopenable vdevs so that the normal autoreplace handler can take * over. */ if (spa->spa_autoreplace && spa->spa_load_state != SPA_LOAD_TRYIMPORT) { spa_check_removed(spa->spa_root_vdev); /* * For the import case, this is done in spa_import(), because * at this point we're using the spare definitions from * the MOS config, not necessarily from the userland config. */ if (spa->spa_load_state != SPA_LOAD_IMPORT) { spa_aux_check_removed(&spa->spa_spares); spa_aux_check_removed(&spa->spa_l2cache); } } /* * Load the vdev metadata such as metaslabs, DTLs, spacemap object, etc. */ error = vdev_load(rvd); if (error != 0) { spa_load_failed(spa, "vdev_load failed [error=%d]", error); return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, error)); } error = spa_ld_log_spacemaps(spa); if (error != 0) { spa_load_failed(spa, "spa_ld_log_spacemaps failed [error=%d]", error); return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, error)); } /* * Propagate the leaf DTLs we just loaded all the way up the vdev tree. */ spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER); vdev_dtl_reassess(rvd, 0, 0, B_FALSE, B_FALSE); spa_config_exit(spa, SCL_ALL, FTAG); return (0); } static int spa_ld_load_dedup_tables(spa_t *spa) { int error = 0; vdev_t *rvd = spa->spa_root_vdev; error = ddt_load(spa); if (error != 0) { spa_load_failed(spa, "ddt_load failed [error=%d]", error); return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO)); } return (0); } static int spa_ld_verify_logs(spa_t *spa, spa_import_type_t type, const char **ereport) { vdev_t *rvd = spa->spa_root_vdev; if (type != SPA_IMPORT_ASSEMBLE && spa_writeable(spa)) { boolean_t missing = spa_check_logs(spa); if (missing) { if (spa->spa_missing_tvds != 0) { spa_load_note(spa, "spa_check_logs failed " "so dropping the logs"); } else { *ereport = FM_EREPORT_ZFS_LOG_REPLAY; spa_load_failed(spa, "spa_check_logs failed"); return (spa_vdev_err(rvd, VDEV_AUX_BAD_LOG, ENXIO)); } } } return (0); } static int spa_ld_verify_pool_data(spa_t *spa) { int error = 0; vdev_t *rvd = spa->spa_root_vdev; /* * We've successfully opened the pool, verify that we're ready * to start pushing transactions. */ if (spa->spa_load_state != SPA_LOAD_TRYIMPORT) { error = spa_load_verify(spa); if (error != 0) { spa_load_failed(spa, "spa_load_verify failed " "[error=%d]", error); return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, error)); } } return (0); } static void spa_ld_claim_log_blocks(spa_t *spa) { dmu_tx_t *tx; dsl_pool_t *dp = spa_get_dsl(spa); /* * Claim log blocks that haven't been committed yet. * This must all happen in a single txg. * Note: spa_claim_max_txg is updated by spa_claim_notify(), * invoked from zil_claim_log_block()'s i/o done callback. * Price of rollback is that we abandon the log. */ spa->spa_claiming = B_TRUE; tx = dmu_tx_create_assigned(dp, spa_first_txg(spa)); (void) dmu_objset_find_dp(dp, dp->dp_root_dir_obj, zil_claim, tx, DS_FIND_CHILDREN); dmu_tx_commit(tx); spa->spa_claiming = B_FALSE; spa_set_log_state(spa, SPA_LOG_GOOD); } static void spa_ld_check_for_config_update(spa_t *spa, uint64_t config_cache_txg, boolean_t update_config_cache) { vdev_t *rvd = spa->spa_root_vdev; int need_update = B_FALSE; /* * If the config cache is stale, or we have uninitialized * metaslabs (see spa_vdev_add()), then update the config. * * If this is a verbatim import, trust the current * in-core spa_config and update the disk labels. */ if (update_config_cache || config_cache_txg != spa->spa_config_txg || spa->spa_load_state == SPA_LOAD_IMPORT || spa->spa_load_state == SPA_LOAD_RECOVER || (spa->spa_import_flags & ZFS_IMPORT_VERBATIM)) need_update = B_TRUE; for (int c = 0; c < rvd->vdev_children; c++) if (rvd->vdev_child[c]->vdev_ms_array == 0) need_update = B_TRUE; /* * Update the config cache asynchronously in case we're the * root pool, in which case the config cache isn't writable yet. */ if (need_update) spa_async_request(spa, SPA_ASYNC_CONFIG_UPDATE); } static void spa_ld_prepare_for_reload(spa_t *spa) { spa_mode_t mode = spa->spa_mode; int async_suspended = spa->spa_async_suspended; spa_unload(spa); spa_deactivate(spa); spa_activate(spa, mode); /* * We save the value of spa_async_suspended as it gets reset to 0 by * spa_unload(). We want to restore it back to the original value before * returning as we might be calling spa_async_resume() later. */ spa->spa_async_suspended = async_suspended; } static int spa_ld_read_checkpoint_txg(spa_t *spa) { uberblock_t checkpoint; int error = 0; ASSERT0(spa->spa_checkpoint_txg); ASSERT(MUTEX_HELD(&spa_namespace_lock)); error = zap_lookup(spa->spa_meta_objset, DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_ZPOOL_CHECKPOINT, sizeof (uint64_t), sizeof (uberblock_t) / sizeof (uint64_t), &checkpoint); if (error == ENOENT) return (0); if (error != 0) return (error); ASSERT3U(checkpoint.ub_txg, !=, 0); ASSERT3U(checkpoint.ub_checkpoint_txg, !=, 0); ASSERT3U(checkpoint.ub_timestamp, !=, 0); spa->spa_checkpoint_txg = checkpoint.ub_txg; spa->spa_checkpoint_info.sci_timestamp = checkpoint.ub_timestamp; return (0); } static int spa_ld_mos_init(spa_t *spa, spa_import_type_t type) { int error = 0; ASSERT(MUTEX_HELD(&spa_namespace_lock)); ASSERT(spa->spa_config_source != SPA_CONFIG_SRC_NONE); /* * Never trust the config that is provided unless we are assembling * a pool following a split. * This means don't trust blkptrs and the vdev tree in general. This * also effectively puts the spa in read-only mode since * spa_writeable() checks for spa_trust_config to be true. * We will later load a trusted config from the MOS. */ if (type != SPA_IMPORT_ASSEMBLE) spa->spa_trust_config = B_FALSE; /* * Parse the config provided to create a vdev tree. */ error = spa_ld_parse_config(spa, type); if (error != 0) return (error); spa_import_progress_add(spa); /* * Now that we have the vdev tree, try to open each vdev. This involves * opening the underlying physical device, retrieving its geometry and * probing the vdev with a dummy I/O. The state of each vdev will be set * based on the success of those operations. After this we'll be ready * to read from the vdevs. */ error = spa_ld_open_vdevs(spa); if (error != 0) return (error); /* * Read the label of each vdev and make sure that the GUIDs stored * there match the GUIDs in the config provided. * If we're assembling a new pool that's been split off from an * existing pool, the labels haven't yet been updated so we skip * validation for now. */ if (type != SPA_IMPORT_ASSEMBLE) { error = spa_ld_validate_vdevs(spa); if (error != 0) return (error); } /* * Read all vdev labels to find the best uberblock (i.e. latest, * unless spa_load_max_txg is set) and store it in spa_uberblock. We * get the list of features required to read blkptrs in the MOS from * the vdev label with the best uberblock and verify that our version * of zfs supports them all. */ error = spa_ld_select_uberblock(spa, type); if (error != 0) return (error); /* * Pass that uberblock to the dsl_pool layer which will open the root * blkptr. This blkptr points to the latest version of the MOS and will * allow us to read its contents. */ error = spa_ld_open_rootbp(spa); if (error != 0) return (error); return (0); } static int spa_ld_checkpoint_rewind(spa_t *spa) { uberblock_t checkpoint; int error = 0; ASSERT(MUTEX_HELD(&spa_namespace_lock)); ASSERT(spa->spa_import_flags & ZFS_IMPORT_CHECKPOINT); error = zap_lookup(spa->spa_meta_objset, DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_ZPOOL_CHECKPOINT, sizeof (uint64_t), sizeof (uberblock_t) / sizeof (uint64_t), &checkpoint); if (error != 0) { spa_load_failed(spa, "unable to retrieve checkpointed " "uberblock from the MOS config [error=%d]", error); if (error == ENOENT) error = ZFS_ERR_NO_CHECKPOINT; return (error); } ASSERT3U(checkpoint.ub_txg, <, spa->spa_uberblock.ub_txg); ASSERT3U(checkpoint.ub_txg, ==, checkpoint.ub_checkpoint_txg); /* * We need to update the txg and timestamp of the checkpointed * uberblock to be higher than the latest one. This ensures that * the checkpointed uberblock is selected if we were to close and * reopen the pool right after we've written it in the vdev labels. * (also see block comment in vdev_uberblock_compare) */ checkpoint.ub_txg = spa->spa_uberblock.ub_txg + 1; checkpoint.ub_timestamp = gethrestime_sec(); /* * Set current uberblock to be the checkpointed uberblock. */ spa->spa_uberblock = checkpoint; /* * If we are doing a normal rewind, then the pool is open for * writing and we sync the "updated" checkpointed uberblock to * disk. Once this is done, we've basically rewound the whole * pool and there is no way back. * * There are cases when we don't want to attempt and sync the * checkpointed uberblock to disk because we are opening a * pool as read-only. Specifically, verifying the checkpointed * state with zdb, and importing the checkpointed state to get * a "preview" of its content. */ if (spa_writeable(spa)) { vdev_t *rvd = spa->spa_root_vdev; spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER); vdev_t *svd[SPA_SYNC_MIN_VDEVS] = { NULL }; int svdcount = 0; int children = rvd->vdev_children; int c0 = random_in_range(children); for (int c = 0; c < children; c++) { vdev_t *vd = rvd->vdev_child[(c0 + c) % children]; /* Stop when revisiting the first vdev */ if (c > 0 && svd[0] == vd) break; if (vd->vdev_ms_array == 0 || vd->vdev_islog || !vdev_is_concrete(vd)) continue; svd[svdcount++] = vd; if (svdcount == SPA_SYNC_MIN_VDEVS) break; } error = vdev_config_sync(svd, svdcount, spa->spa_first_txg); if (error == 0) spa->spa_last_synced_guid = rvd->vdev_guid; spa_config_exit(spa, SCL_ALL, FTAG); if (error != 0) { spa_load_failed(spa, "failed to write checkpointed " "uberblock to the vdev labels [error=%d]", error); return (error); } } return (0); } static int spa_ld_mos_with_trusted_config(spa_t *spa, spa_import_type_t type, boolean_t *update_config_cache) { int error; /* * Parse the config for pool, open and validate vdevs, * select an uberblock, and use that uberblock to open * the MOS. */ error = spa_ld_mos_init(spa, type); if (error != 0) return (error); /* * Retrieve the trusted config stored in the MOS and use it to create * a new, exact version of the vdev tree, then reopen all vdevs. */ error = spa_ld_trusted_config(spa, type, B_FALSE); if (error == EAGAIN) { if (update_config_cache != NULL) *update_config_cache = B_TRUE; /* * Redo the loading process with the trusted config if it is * too different from the untrusted config. */ spa_ld_prepare_for_reload(spa); spa_load_note(spa, "RELOADING"); error = spa_ld_mos_init(spa, type); if (error != 0) return (error); error = spa_ld_trusted_config(spa, type, B_TRUE); if (error != 0) return (error); } else if (error != 0) { return (error); } return (0); } /* * Load an existing storage pool, using the config provided. This config * describes which vdevs are part of the pool and is later validated against * partial configs present in each vdev's label and an entire copy of the * config stored in the MOS. */ static int spa_load_impl(spa_t *spa, spa_import_type_t type, const char **ereport) { int error = 0; boolean_t missing_feat_write = B_FALSE; boolean_t checkpoint_rewind = (spa->spa_import_flags & ZFS_IMPORT_CHECKPOINT); boolean_t update_config_cache = B_FALSE; ASSERT(MUTEX_HELD(&spa_namespace_lock)); ASSERT(spa->spa_config_source != SPA_CONFIG_SRC_NONE); spa_load_note(spa, "LOADING"); error = spa_ld_mos_with_trusted_config(spa, type, &update_config_cache); if (error != 0) return (error); /* * If we are rewinding to the checkpoint then we need to repeat * everything we've done so far in this function but this time * selecting the checkpointed uberblock and using that to open * the MOS. */ if (checkpoint_rewind) { /* * If we are rewinding to the checkpoint update config cache * anyway. */ update_config_cache = B_TRUE; /* * Extract the checkpointed uberblock from the current MOS * and use this as the pool's uberblock from now on. If the * pool is imported as writeable we also write the checkpoint * uberblock to the labels, making the rewind permanent. */ error = spa_ld_checkpoint_rewind(spa); if (error != 0) return (error); /* * Redo the loading process again with the * checkpointed uberblock. */ spa_ld_prepare_for_reload(spa); spa_load_note(spa, "LOADING checkpointed uberblock"); error = spa_ld_mos_with_trusted_config(spa, type, NULL); if (error != 0) return (error); } /* * Retrieve the checkpoint txg if the pool has a checkpoint. */ error = spa_ld_read_checkpoint_txg(spa); if (error != 0) return (error); /* * Retrieve the mapping of indirect vdevs. Those vdevs were removed * from the pool and their contents were re-mapped to other vdevs. Note * that everything that we read before this step must have been * rewritten on concrete vdevs after the last device removal was * initiated. Otherwise we could be reading from indirect vdevs before * we have loaded their mappings. */ error = spa_ld_open_indirect_vdev_metadata(spa); if (error != 0) return (error); /* * Retrieve the full list of active features from the MOS and check if * they are all supported. */ error = spa_ld_check_features(spa, &missing_feat_write); if (error != 0) return (error); /* * Load several special directories from the MOS needed by the dsl_pool * layer. */ error = spa_ld_load_special_directories(spa); if (error != 0) return (error); /* * Retrieve pool properties from the MOS. */ error = spa_ld_get_props(spa); if (error != 0) return (error); /* * Retrieve the list of auxiliary devices - cache devices and spares - * and open them. */ error = spa_ld_open_aux_vdevs(spa, type); if (error != 0) return (error); /* * Load the metadata for all vdevs. Also check if unopenable devices * should be autoreplaced. */ error = spa_ld_load_vdev_metadata(spa); if (error != 0) return (error); error = spa_ld_load_dedup_tables(spa); if (error != 0) return (error); /* * Verify the logs now to make sure we don't have any unexpected errors * when we claim log blocks later. */ error = spa_ld_verify_logs(spa, type, ereport); if (error != 0) return (error); if (missing_feat_write) { ASSERT(spa->spa_load_state == SPA_LOAD_TRYIMPORT); /* * At this point, we know that we can open the pool in * read-only mode but not read-write mode. We now have enough * information and can return to userland. */ return (spa_vdev_err(spa->spa_root_vdev, VDEV_AUX_UNSUP_FEAT, ENOTSUP)); } /* * Traverse the last txgs to make sure the pool was left off in a safe * state. When performing an extreme rewind, we verify the whole pool, * which can take a very long time. */ error = spa_ld_verify_pool_data(spa); if (error != 0) return (error); /* * Calculate the deflated space for the pool. This must be done before * we write anything to the pool because we'd need to update the space * accounting using the deflated sizes. */ spa_update_dspace(spa); /* * We have now retrieved all the information we needed to open the * pool. If we are importing the pool in read-write mode, a few * additional steps must be performed to finish the import. */ if (spa_writeable(spa) && (spa->spa_load_state == SPA_LOAD_RECOVER || spa->spa_load_max_txg == UINT64_MAX)) { uint64_t config_cache_txg = spa->spa_config_txg; ASSERT(spa->spa_load_state != SPA_LOAD_TRYIMPORT); /* * In case of a checkpoint rewind, log the original txg * of the checkpointed uberblock. */ if (checkpoint_rewind) { spa_history_log_internal(spa, "checkpoint rewind", NULL, "rewound state to txg=%llu", (u_longlong_t)spa->spa_uberblock.ub_checkpoint_txg); } /* * Traverse the ZIL and claim all blocks. */ spa_ld_claim_log_blocks(spa); /* * Kick-off the syncing thread. */ spa->spa_sync_on = B_TRUE; txg_sync_start(spa->spa_dsl_pool); mmp_thread_start(spa); /* * Wait for all claims to sync. We sync up to the highest * claimed log block birth time so that claimed log blocks * don't appear to be from the future. spa_claim_max_txg * will have been set for us by ZIL traversal operations * performed above. */ txg_wait_synced(spa->spa_dsl_pool, spa->spa_claim_max_txg); /* * Check if we need to request an update of the config. On the * next sync, we would update the config stored in vdev labels * and the cachefile (by default /etc/zfs/zpool.cache). */ spa_ld_check_for_config_update(spa, config_cache_txg, update_config_cache); /* * Check if a rebuild was in progress and if so resume it. * Then check all DTLs to see if anything needs resilvering. * The resilver will be deferred if a rebuild was started. */ if (vdev_rebuild_active(spa->spa_root_vdev)) { vdev_rebuild_restart(spa); } else if (!dsl_scan_resilvering(spa->spa_dsl_pool) && vdev_resilver_needed(spa->spa_root_vdev, NULL, NULL)) { spa_async_request(spa, SPA_ASYNC_RESILVER); } /* * Log the fact that we booted up (so that we can detect if * we rebooted in the middle of an operation). */ spa_history_log_version(spa, "open", NULL); spa_restart_removal(spa); spa_spawn_aux_threads(spa); /* * Delete any inconsistent datasets. * * Note: * Since we may be issuing deletes for clones here, * we make sure to do so after we've spawned all the * auxiliary threads above (from which the livelist * deletion zthr is part of). */ (void) dmu_objset_find(spa_name(spa), dsl_destroy_inconsistent, NULL, DS_FIND_CHILDREN); /* * Clean up any stale temporary dataset userrefs. */ dsl_pool_clean_tmp_userrefs(spa->spa_dsl_pool); spa_config_enter(spa, SCL_CONFIG, FTAG, RW_READER); vdev_initialize_restart(spa->spa_root_vdev); vdev_trim_restart(spa->spa_root_vdev); vdev_autotrim_restart(spa); spa_config_exit(spa, SCL_CONFIG, FTAG); } spa_import_progress_remove(spa_guid(spa)); spa_async_request(spa, SPA_ASYNC_L2CACHE_REBUILD); spa_load_note(spa, "LOADED"); return (0); } static int spa_load_retry(spa_t *spa, spa_load_state_t state) { spa_mode_t mode = spa->spa_mode; spa_unload(spa); spa_deactivate(spa); spa->spa_load_max_txg = spa->spa_uberblock.ub_txg - 1; spa_activate(spa, mode); spa_async_suspend(spa); spa_load_note(spa, "spa_load_retry: rewind, max txg: %llu", (u_longlong_t)spa->spa_load_max_txg); return (spa_load(spa, state, SPA_IMPORT_EXISTING)); } /* * If spa_load() fails this function will try loading prior txg's. If * 'state' is SPA_LOAD_RECOVER and one of these loads succeeds the pool * will be rewound to that txg. If 'state' is not SPA_LOAD_RECOVER this * function will not rewind the pool and will return the same error as * spa_load(). */ static int spa_load_best(spa_t *spa, spa_load_state_t state, uint64_t max_request, int rewind_flags) { nvlist_t *loadinfo = NULL; nvlist_t *config = NULL; int load_error, rewind_error; uint64_t safe_rewind_txg; uint64_t min_txg; if (spa->spa_load_txg && state == SPA_LOAD_RECOVER) { spa->spa_load_max_txg = spa->spa_load_txg; spa_set_log_state(spa, SPA_LOG_CLEAR); } else { spa->spa_load_max_txg = max_request; if (max_request != UINT64_MAX) spa->spa_extreme_rewind = B_TRUE; } load_error = rewind_error = spa_load(spa, state, SPA_IMPORT_EXISTING); if (load_error == 0) return (0); if (load_error == ZFS_ERR_NO_CHECKPOINT) { /* * When attempting checkpoint-rewind on a pool with no * checkpoint, we should not attempt to load uberblocks * from previous txgs when spa_load fails. */ ASSERT(spa->spa_import_flags & ZFS_IMPORT_CHECKPOINT); spa_import_progress_remove(spa_guid(spa)); return (load_error); } if (spa->spa_root_vdev != NULL) config = spa_config_generate(spa, NULL, -1ULL, B_TRUE); spa->spa_last_ubsync_txg = spa->spa_uberblock.ub_txg; spa->spa_last_ubsync_txg_ts = spa->spa_uberblock.ub_timestamp; if (rewind_flags & ZPOOL_NEVER_REWIND) { nvlist_free(config); spa_import_progress_remove(spa_guid(spa)); return (load_error); } if (state == SPA_LOAD_RECOVER) { /* Price of rolling back is discarding txgs, including log */ spa_set_log_state(spa, SPA_LOG_CLEAR); } else { /* * If we aren't rolling back save the load info from our first * import attempt so that we can restore it after attempting * to rewind. */ loadinfo = spa->spa_load_info; spa->spa_load_info = fnvlist_alloc(); } spa->spa_load_max_txg = spa->spa_last_ubsync_txg; safe_rewind_txg = spa->spa_last_ubsync_txg - TXG_DEFER_SIZE; min_txg = (rewind_flags & ZPOOL_EXTREME_REWIND) ? TXG_INITIAL : safe_rewind_txg; /* * Continue as long as we're finding errors, we're still within * the acceptable rewind range, and we're still finding uberblocks */ while (rewind_error && spa->spa_uberblock.ub_txg >= min_txg && spa->spa_uberblock.ub_txg <= spa->spa_load_max_txg) { if (spa->spa_load_max_txg < safe_rewind_txg) spa->spa_extreme_rewind = B_TRUE; rewind_error = spa_load_retry(spa, state); } spa->spa_extreme_rewind = B_FALSE; spa->spa_load_max_txg = UINT64_MAX; if (config && (rewind_error || state != SPA_LOAD_RECOVER)) spa_config_set(spa, config); else nvlist_free(config); if (state == SPA_LOAD_RECOVER) { ASSERT3P(loadinfo, ==, NULL); spa_import_progress_remove(spa_guid(spa)); return (rewind_error); } else { /* Store the rewind info as part of the initial load info */ fnvlist_add_nvlist(loadinfo, ZPOOL_CONFIG_REWIND_INFO, spa->spa_load_info); /* Restore the initial load info */ fnvlist_free(spa->spa_load_info); spa->spa_load_info = loadinfo; spa_import_progress_remove(spa_guid(spa)); return (load_error); } } /* * Pool Open/Import * * The import case is identical to an open except that the configuration is sent * down from userland, instead of grabbed from the configuration cache. For the * case of an open, the pool configuration will exist in the * POOL_STATE_UNINITIALIZED state. * * The stats information (gen/count/ustats) is used to gather vdev statistics at * the same time open the pool, without having to keep around the spa_t in some * ambiguous state. */ static int spa_open_common(const char *pool, spa_t **spapp, const void *tag, nvlist_t *nvpolicy, nvlist_t **config) { spa_t *spa; spa_load_state_t state = SPA_LOAD_OPEN; int error; int locked = B_FALSE; int firstopen = B_FALSE; *spapp = NULL; /* * As disgusting as this is, we need to support recursive calls to this * function because dsl_dir_open() is called during spa_load(), and ends * up calling spa_open() again. The real fix is to figure out how to * avoid dsl_dir_open() calling this in the first place. */ if (MUTEX_NOT_HELD(&spa_namespace_lock)) { mutex_enter(&spa_namespace_lock); locked = B_TRUE; } if ((spa = spa_lookup(pool)) == NULL) { if (locked) mutex_exit(&spa_namespace_lock); return (SET_ERROR(ENOENT)); } if (spa->spa_state == POOL_STATE_UNINITIALIZED) { zpool_load_policy_t policy; firstopen = B_TRUE; zpool_get_load_policy(nvpolicy ? nvpolicy : spa->spa_config, &policy); if (policy.zlp_rewind & ZPOOL_DO_REWIND) state = SPA_LOAD_RECOVER; spa_activate(spa, spa_mode_global); if (state != SPA_LOAD_RECOVER) spa->spa_last_ubsync_txg = spa->spa_load_txg = 0; spa->spa_config_source = SPA_CONFIG_SRC_CACHEFILE; zfs_dbgmsg("spa_open_common: opening %s", pool); error = spa_load_best(spa, state, policy.zlp_txg, policy.zlp_rewind); if (error == EBADF) { /* * If vdev_validate() returns failure (indicated by * EBADF), it indicates that one of the vdevs indicates * that the pool has been exported or destroyed. If * this is the case, the config cache is out of sync and * we should remove the pool from the namespace. */ spa_unload(spa); spa_deactivate(spa); spa_write_cachefile(spa, B_TRUE, B_TRUE, B_FALSE); spa_remove(spa); if (locked) mutex_exit(&spa_namespace_lock); return (SET_ERROR(ENOENT)); } if (error) { /* * We can't open the pool, but we still have useful * information: the state of each vdev after the * attempted vdev_open(). Return this to the user. */ if (config != NULL && spa->spa_config) { *config = fnvlist_dup(spa->spa_config); fnvlist_add_nvlist(*config, ZPOOL_CONFIG_LOAD_INFO, spa->spa_load_info); } spa_unload(spa); spa_deactivate(spa); spa->spa_last_open_failed = error; if (locked) mutex_exit(&spa_namespace_lock); *spapp = NULL; return (error); } } spa_open_ref(spa, tag); if (config != NULL) *config = spa_config_generate(spa, NULL, -1ULL, B_TRUE); /* * If we've recovered the pool, pass back any information we * gathered while doing the load. */ if (state == SPA_LOAD_RECOVER) { fnvlist_add_nvlist(*config, ZPOOL_CONFIG_LOAD_INFO, spa->spa_load_info); } if (locked) { spa->spa_last_open_failed = 0; spa->spa_last_ubsync_txg = 0; spa->spa_load_txg = 0; mutex_exit(&spa_namespace_lock); } if (firstopen) zvol_create_minors_recursive(spa_name(spa)); *spapp = spa; return (0); } int spa_open_rewind(const char *name, spa_t **spapp, const void *tag, nvlist_t *policy, nvlist_t **config) { return (spa_open_common(name, spapp, tag, policy, config)); } int spa_open(const char *name, spa_t **spapp, const void *tag) { return (spa_open_common(name, spapp, tag, NULL, NULL)); } /* * Lookup the given spa_t, incrementing the inject count in the process, * preventing it from being exported or destroyed. */ spa_t * spa_inject_addref(char *name) { spa_t *spa; mutex_enter(&spa_namespace_lock); if ((spa = spa_lookup(name)) == NULL) { mutex_exit(&spa_namespace_lock); return (NULL); } spa->spa_inject_ref++; mutex_exit(&spa_namespace_lock); return (spa); } void spa_inject_delref(spa_t *spa) { mutex_enter(&spa_namespace_lock); spa->spa_inject_ref--; mutex_exit(&spa_namespace_lock); } /* * Add spares device information to the nvlist. */ static void spa_add_spares(spa_t *spa, nvlist_t *config) { nvlist_t **spares; uint_t i, nspares; nvlist_t *nvroot; uint64_t guid; vdev_stat_t *vs; uint_t vsc; uint64_t pool; ASSERT(spa_config_held(spa, SCL_CONFIG, RW_READER)); if (spa->spa_spares.sav_count == 0) return; nvroot = fnvlist_lookup_nvlist(config, ZPOOL_CONFIG_VDEV_TREE); VERIFY0(nvlist_lookup_nvlist_array(spa->spa_spares.sav_config, ZPOOL_CONFIG_SPARES, &spares, &nspares)); if (nspares != 0) { fnvlist_add_nvlist_array(nvroot, ZPOOL_CONFIG_SPARES, (const nvlist_t * const *)spares, nspares); VERIFY0(nvlist_lookup_nvlist_array(nvroot, ZPOOL_CONFIG_SPARES, &spares, &nspares)); /* * Go through and find any spares which have since been * repurposed as an active spare. If this is the case, update * their status appropriately. */ for (i = 0; i < nspares; i++) { guid = fnvlist_lookup_uint64(spares[i], ZPOOL_CONFIG_GUID); if (spa_spare_exists(guid, &pool, NULL) && pool != 0ULL) { VERIFY0(nvlist_lookup_uint64_array(spares[i], ZPOOL_CONFIG_VDEV_STATS, (uint64_t **)&vs, &vsc)); vs->vs_state = VDEV_STATE_CANT_OPEN; vs->vs_aux = VDEV_AUX_SPARED; } } } } /* * Add l2cache device information to the nvlist, including vdev stats. */ static void spa_add_l2cache(spa_t *spa, nvlist_t *config) { nvlist_t **l2cache; uint_t i, j, nl2cache; nvlist_t *nvroot; uint64_t guid; vdev_t *vd; vdev_stat_t *vs; uint_t vsc; ASSERT(spa_config_held(spa, SCL_CONFIG, RW_READER)); if (spa->spa_l2cache.sav_count == 0) return; nvroot = fnvlist_lookup_nvlist(config, ZPOOL_CONFIG_VDEV_TREE); VERIFY0(nvlist_lookup_nvlist_array(spa->spa_l2cache.sav_config, ZPOOL_CONFIG_L2CACHE, &l2cache, &nl2cache)); if (nl2cache != 0) { fnvlist_add_nvlist_array(nvroot, ZPOOL_CONFIG_L2CACHE, (const nvlist_t * const *)l2cache, nl2cache); VERIFY0(nvlist_lookup_nvlist_array(nvroot, ZPOOL_CONFIG_L2CACHE, &l2cache, &nl2cache)); /* * Update level 2 cache device stats. */ for (i = 0; i < nl2cache; i++) { guid = fnvlist_lookup_uint64(l2cache[i], ZPOOL_CONFIG_GUID); vd = NULL; for (j = 0; j < spa->spa_l2cache.sav_count; j++) { if (guid == spa->spa_l2cache.sav_vdevs[j]->vdev_guid) { vd = spa->spa_l2cache.sav_vdevs[j]; break; } } ASSERT(vd != NULL); VERIFY0(nvlist_lookup_uint64_array(l2cache[i], ZPOOL_CONFIG_VDEV_STATS, (uint64_t **)&vs, &vsc)); vdev_get_stats(vd, vs); vdev_config_generate_stats(vd, l2cache[i]); } } } static void spa_feature_stats_from_disk(spa_t *spa, nvlist_t *features) { zap_cursor_t zc; zap_attribute_t za; if (spa->spa_feat_for_read_obj != 0) { for (zap_cursor_init(&zc, spa->spa_meta_objset, spa->spa_feat_for_read_obj); zap_cursor_retrieve(&zc, &za) == 0; zap_cursor_advance(&zc)) { ASSERT(za.za_integer_length == sizeof (uint64_t) && za.za_num_integers == 1); VERIFY0(nvlist_add_uint64(features, za.za_name, za.za_first_integer)); } zap_cursor_fini(&zc); } if (spa->spa_feat_for_write_obj != 0) { for (zap_cursor_init(&zc, spa->spa_meta_objset, spa->spa_feat_for_write_obj); zap_cursor_retrieve(&zc, &za) == 0; zap_cursor_advance(&zc)) { ASSERT(za.za_integer_length == sizeof (uint64_t) && za.za_num_integers == 1); VERIFY0(nvlist_add_uint64(features, za.za_name, za.za_first_integer)); } zap_cursor_fini(&zc); } } static void spa_feature_stats_from_cache(spa_t *spa, nvlist_t *features) { int i; for (i = 0; i < SPA_FEATURES; i++) { zfeature_info_t feature = spa_feature_table[i]; uint64_t refcount; if (feature_get_refcount(spa, &feature, &refcount) != 0) continue; VERIFY0(nvlist_add_uint64(features, feature.fi_guid, refcount)); } } /* * Store a list of pool features and their reference counts in the * config. * * The first time this is called on a spa, allocate a new nvlist, fetch * the pool features and reference counts from disk, then save the list * in the spa. In subsequent calls on the same spa use the saved nvlist * and refresh its values from the cached reference counts. This * ensures we don't block here on I/O on a suspended pool so 'zpool * clear' can resume the pool. */ static void spa_add_feature_stats(spa_t *spa, nvlist_t *config) { nvlist_t *features; ASSERT(spa_config_held(spa, SCL_CONFIG, RW_READER)); mutex_enter(&spa->spa_feat_stats_lock); features = spa->spa_feat_stats; if (features != NULL) { spa_feature_stats_from_cache(spa, features); } else { VERIFY0(nvlist_alloc(&features, NV_UNIQUE_NAME, KM_SLEEP)); spa->spa_feat_stats = features; spa_feature_stats_from_disk(spa, features); } VERIFY0(nvlist_add_nvlist(config, ZPOOL_CONFIG_FEATURE_STATS, features)); mutex_exit(&spa->spa_feat_stats_lock); } int spa_get_stats(const char *name, nvlist_t **config, char *altroot, size_t buflen) { int error; spa_t *spa; *config = NULL; error = spa_open_common(name, &spa, FTAG, NULL, config); if (spa != NULL) { /* * This still leaves a window of inconsistency where the spares * or l2cache devices could change and the config would be * self-inconsistent. */ spa_config_enter(spa, SCL_CONFIG, FTAG, RW_READER); if (*config != NULL) { uint64_t loadtimes[2]; loadtimes[0] = spa->spa_loaded_ts.tv_sec; loadtimes[1] = spa->spa_loaded_ts.tv_nsec; fnvlist_add_uint64_array(*config, ZPOOL_CONFIG_LOADED_TIME, loadtimes, 2); fnvlist_add_uint64(*config, ZPOOL_CONFIG_ERRCOUNT, spa_get_errlog_size(spa)); if (spa_suspended(spa)) { fnvlist_add_uint64(*config, ZPOOL_CONFIG_SUSPENDED, spa->spa_failmode); fnvlist_add_uint64(*config, ZPOOL_CONFIG_SUSPENDED_REASON, spa->spa_suspended); } spa_add_spares(spa, *config); spa_add_l2cache(spa, *config); spa_add_feature_stats(spa, *config); } } /* * We want to get the alternate root even for faulted pools, so we cheat * and call spa_lookup() directly. */ if (altroot) { if (spa == NULL) { mutex_enter(&spa_namespace_lock); spa = spa_lookup(name); if (spa) spa_altroot(spa, altroot, buflen); else altroot[0] = '\0'; spa = NULL; mutex_exit(&spa_namespace_lock); } else { spa_altroot(spa, altroot, buflen); } } if (spa != NULL) { spa_config_exit(spa, SCL_CONFIG, FTAG); spa_close(spa, FTAG); } return (error); } /* * Validate that the auxiliary device array is well formed. We must have an * array of nvlists, each which describes a valid leaf vdev. If this is an * import (mode is VDEV_ALLOC_SPARE), then we allow corrupted spares to be * specified, as long as they are well-formed. */ static int spa_validate_aux_devs(spa_t *spa, nvlist_t *nvroot, uint64_t crtxg, int mode, spa_aux_vdev_t *sav, const char *config, uint64_t version, vdev_labeltype_t label) { nvlist_t **dev; uint_t i, ndev; vdev_t *vd; int error; ASSERT(spa_config_held(spa, SCL_ALL, RW_WRITER) == SCL_ALL); /* * It's acceptable to have no devs specified. */ if (nvlist_lookup_nvlist_array(nvroot, config, &dev, &ndev) != 0) return (0); if (ndev == 0) return (SET_ERROR(EINVAL)); /* * Make sure the pool is formatted with a version that supports this * device type. */ if (spa_version(spa) < version) return (SET_ERROR(ENOTSUP)); /* * Set the pending device list so we correctly handle device in-use * checking. */ sav->sav_pending = dev; sav->sav_npending = ndev; for (i = 0; i < ndev; i++) { if ((error = spa_config_parse(spa, &vd, dev[i], NULL, 0, mode)) != 0) goto out; if (!vd->vdev_ops->vdev_op_leaf) { vdev_free(vd); error = SET_ERROR(EINVAL); goto out; } vd->vdev_top = vd; if ((error = vdev_open(vd)) == 0 && (error = vdev_label_init(vd, crtxg, label)) == 0) { fnvlist_add_uint64(dev[i], ZPOOL_CONFIG_GUID, vd->vdev_guid); } vdev_free(vd); if (error && (mode != VDEV_ALLOC_SPARE && mode != VDEV_ALLOC_L2CACHE)) goto out; else error = 0; } out: sav->sav_pending = NULL; sav->sav_npending = 0; return (error); } static int spa_validate_aux(spa_t *spa, nvlist_t *nvroot, uint64_t crtxg, int mode) { int error; ASSERT(spa_config_held(spa, SCL_ALL, RW_WRITER) == SCL_ALL); if ((error = spa_validate_aux_devs(spa, nvroot, crtxg, mode, &spa->spa_spares, ZPOOL_CONFIG_SPARES, SPA_VERSION_SPARES, VDEV_LABEL_SPARE)) != 0) { return (error); } return (spa_validate_aux_devs(spa, nvroot, crtxg, mode, &spa->spa_l2cache, ZPOOL_CONFIG_L2CACHE, SPA_VERSION_L2CACHE, VDEV_LABEL_L2CACHE)); } static void spa_set_aux_vdevs(spa_aux_vdev_t *sav, nvlist_t **devs, int ndevs, const char *config) { int i; if (sav->sav_config != NULL) { nvlist_t **olddevs; uint_t oldndevs; nvlist_t **newdevs; /* * Generate new dev list by concatenating with the * current dev list. */ VERIFY0(nvlist_lookup_nvlist_array(sav->sav_config, config, &olddevs, &oldndevs)); newdevs = kmem_alloc(sizeof (void *) * (ndevs + oldndevs), KM_SLEEP); for (i = 0; i < oldndevs; i++) newdevs[i] = fnvlist_dup(olddevs[i]); for (i = 0; i < ndevs; i++) newdevs[i + oldndevs] = fnvlist_dup(devs[i]); fnvlist_remove(sav->sav_config, config); fnvlist_add_nvlist_array(sav->sav_config, config, (const nvlist_t * const *)newdevs, ndevs + oldndevs); for (i = 0; i < oldndevs + ndevs; i++) nvlist_free(newdevs[i]); kmem_free(newdevs, (oldndevs + ndevs) * sizeof (void *)); } else { /* * Generate a new dev list. */ sav->sav_config = fnvlist_alloc(); fnvlist_add_nvlist_array(sav->sav_config, config, (const nvlist_t * const *)devs, ndevs); } } /* * Stop and drop level 2 ARC devices */ void spa_l2cache_drop(spa_t *spa) { vdev_t *vd; int i; spa_aux_vdev_t *sav = &spa->spa_l2cache; for (i = 0; i < sav->sav_count; i++) { uint64_t pool; vd = sav->sav_vdevs[i]; ASSERT(vd != NULL); if (spa_l2cache_exists(vd->vdev_guid, &pool) && pool != 0ULL && l2arc_vdev_present(vd)) l2arc_remove_vdev(vd); } } /* * Verify encryption parameters for spa creation. If we are encrypting, we must * have the encryption feature flag enabled. */ static int spa_create_check_encryption_params(dsl_crypto_params_t *dcp, boolean_t has_encryption) { if (dcp->cp_crypt != ZIO_CRYPT_OFF && dcp->cp_crypt != ZIO_CRYPT_INHERIT && !has_encryption) return (SET_ERROR(ENOTSUP)); return (dmu_objset_create_crypt_check(NULL, dcp, NULL)); } /* * Pool Creation */ int spa_create(const char *pool, nvlist_t *nvroot, nvlist_t *props, nvlist_t *zplprops, dsl_crypto_params_t *dcp) { spa_t *spa; char *altroot = NULL; vdev_t *rvd; dsl_pool_t *dp; dmu_tx_t *tx; int error = 0; uint64_t txg = TXG_INITIAL; nvlist_t **spares, **l2cache; uint_t nspares, nl2cache; uint64_t version, obj, ndraid = 0; boolean_t has_features; boolean_t has_encryption; boolean_t has_allocclass; spa_feature_t feat; char *feat_name; char *poolname; nvlist_t *nvl; if (props == NULL || nvlist_lookup_string(props, "tname", &poolname) != 0) poolname = (char *)pool; /* * If this pool already exists, return failure. */ mutex_enter(&spa_namespace_lock); if (spa_lookup(poolname) != NULL) { mutex_exit(&spa_namespace_lock); return (SET_ERROR(EEXIST)); } /* * Allocate a new spa_t structure. */ nvl = fnvlist_alloc(); fnvlist_add_string(nvl, ZPOOL_CONFIG_POOL_NAME, pool); (void) nvlist_lookup_string(props, zpool_prop_to_name(ZPOOL_PROP_ALTROOT), &altroot); spa = spa_add(poolname, nvl, altroot); fnvlist_free(nvl); spa_activate(spa, spa_mode_global); if (props && (error = spa_prop_validate(spa, props))) { spa_deactivate(spa); spa_remove(spa); mutex_exit(&spa_namespace_lock); return (error); } /* * Temporary pool names should never be written to disk. */ if (poolname != pool) spa->spa_import_flags |= ZFS_IMPORT_TEMP_NAME; has_features = B_FALSE; has_encryption = B_FALSE; has_allocclass = B_FALSE; for (nvpair_t *elem = nvlist_next_nvpair(props, NULL); elem != NULL; elem = nvlist_next_nvpair(props, elem)) { if (zpool_prop_feature(nvpair_name(elem))) { has_features = B_TRUE; feat_name = strchr(nvpair_name(elem), '@') + 1; VERIFY0(zfeature_lookup_name(feat_name, &feat)); if (feat == SPA_FEATURE_ENCRYPTION) has_encryption = B_TRUE; if (feat == SPA_FEATURE_ALLOCATION_CLASSES) has_allocclass = B_TRUE; } } /* verify encryption params, if they were provided */ if (dcp != NULL) { error = spa_create_check_encryption_params(dcp, has_encryption); if (error != 0) { spa_deactivate(spa); spa_remove(spa); mutex_exit(&spa_namespace_lock); return (error); } } if (!has_allocclass && zfs_special_devs(nvroot, NULL)) { spa_deactivate(spa); spa_remove(spa); mutex_exit(&spa_namespace_lock); return (ENOTSUP); } if (has_features || nvlist_lookup_uint64(props, zpool_prop_to_name(ZPOOL_PROP_VERSION), &version) != 0) { version = SPA_VERSION; } ASSERT(SPA_VERSION_IS_SUPPORTED(version)); spa->spa_first_txg = txg; spa->spa_uberblock.ub_txg = txg - 1; spa->spa_uberblock.ub_version = version; spa->spa_ubsync = spa->spa_uberblock; spa->spa_load_state = SPA_LOAD_CREATE; spa->spa_removing_phys.sr_state = DSS_NONE; spa->spa_removing_phys.sr_removing_vdev = -1; spa->spa_removing_phys.sr_prev_indirect_vdev = -1; spa->spa_indirect_vdevs_loaded = B_TRUE; /* * Create "The Godfather" zio to hold all async IOs */ spa->spa_async_zio_root = kmem_alloc(max_ncpus * sizeof (void *), KM_SLEEP); for (int i = 0; i < max_ncpus; i++) { spa->spa_async_zio_root[i] = zio_root(spa, NULL, NULL, ZIO_FLAG_CANFAIL | ZIO_FLAG_SPECULATIVE | ZIO_FLAG_GODFATHER); } /* * Create the root vdev. */ spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER); error = spa_config_parse(spa, &rvd, nvroot, NULL, 0, VDEV_ALLOC_ADD); ASSERT(error != 0 || rvd != NULL); ASSERT(error != 0 || spa->spa_root_vdev == rvd); if (error == 0 && !zfs_allocatable_devs(nvroot)) error = SET_ERROR(EINVAL); if (error == 0 && (error = vdev_create(rvd, txg, B_FALSE)) == 0 && (error = vdev_draid_spare_create(nvroot, rvd, &ndraid, 0)) == 0 && (error = spa_validate_aux(spa, nvroot, txg, VDEV_ALLOC_ADD)) == 0) { /* * instantiate the metaslab groups (this will dirty the vdevs) * we can no longer error exit past this point */ for (int c = 0; error == 0 && c < rvd->vdev_children; c++) { vdev_t *vd = rvd->vdev_child[c]; vdev_metaslab_set_size(vd); vdev_expand(vd, txg); } } spa_config_exit(spa, SCL_ALL, FTAG); if (error != 0) { spa_unload(spa); spa_deactivate(spa); spa_remove(spa); mutex_exit(&spa_namespace_lock); return (error); } /* * Get the list of spares, if specified. */ if (nvlist_lookup_nvlist_array(nvroot, ZPOOL_CONFIG_SPARES, &spares, &nspares) == 0) { spa->spa_spares.sav_config = fnvlist_alloc(); fnvlist_add_nvlist_array(spa->spa_spares.sav_config, ZPOOL_CONFIG_SPARES, (const nvlist_t * const *)spares, nspares); spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER); spa_load_spares(spa); spa_config_exit(spa, SCL_ALL, FTAG); spa->spa_spares.sav_sync = B_TRUE; } /* * Get the list of level 2 cache devices, if specified. */ if (nvlist_lookup_nvlist_array(nvroot, ZPOOL_CONFIG_L2CACHE, &l2cache, &nl2cache) == 0) { VERIFY0(nvlist_alloc(&spa->spa_l2cache.sav_config, NV_UNIQUE_NAME, KM_SLEEP)); fnvlist_add_nvlist_array(spa->spa_l2cache.sav_config, ZPOOL_CONFIG_L2CACHE, (const nvlist_t * const *)l2cache, nl2cache); spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER); spa_load_l2cache(spa); spa_config_exit(spa, SCL_ALL, FTAG); spa->spa_l2cache.sav_sync = B_TRUE; } spa->spa_is_initializing = B_TRUE; spa->spa_dsl_pool = dp = dsl_pool_create(spa, zplprops, dcp, txg); spa->spa_is_initializing = B_FALSE; /* * Create DDTs (dedup tables). */ ddt_create(spa); spa_update_dspace(spa); tx = dmu_tx_create_assigned(dp, txg); /* * Create the pool's history object. */ if (version >= SPA_VERSION_ZPOOL_HISTORY && !spa->spa_history) spa_history_create_obj(spa, tx); spa_event_notify(spa, NULL, NULL, ESC_ZFS_POOL_CREATE); spa_history_log_version(spa, "create", tx); /* * Create the pool config object. */ spa->spa_config_object = dmu_object_alloc(spa->spa_meta_objset, DMU_OT_PACKED_NVLIST, SPA_CONFIG_BLOCKSIZE, DMU_OT_PACKED_NVLIST_SIZE, sizeof (uint64_t), tx); if (zap_add(spa->spa_meta_objset, DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_CONFIG, sizeof (uint64_t), 1, &spa->spa_config_object, tx) != 0) { cmn_err(CE_PANIC, "failed to add pool config"); } if (zap_add(spa->spa_meta_objset, DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_CREATION_VERSION, sizeof (uint64_t), 1, &version, tx) != 0) { cmn_err(CE_PANIC, "failed to add pool version"); } /* Newly created pools with the right version are always deflated. */ if (version >= SPA_VERSION_RAIDZ_DEFLATE) { spa->spa_deflate = TRUE; if (zap_add(spa->spa_meta_objset, DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_DEFLATE, sizeof (uint64_t), 1, &spa->spa_deflate, tx) != 0) { cmn_err(CE_PANIC, "failed to add deflate"); } } /* * Create the deferred-free bpobj. Turn off compression * because sync-to-convergence takes longer if the blocksize * keeps changing. */ obj = bpobj_alloc(spa->spa_meta_objset, 1 << 14, tx); dmu_object_set_compress(spa->spa_meta_objset, obj, ZIO_COMPRESS_OFF, tx); if (zap_add(spa->spa_meta_objset, DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_SYNC_BPOBJ, sizeof (uint64_t), 1, &obj, tx) != 0) { cmn_err(CE_PANIC, "failed to add bpobj"); } VERIFY3U(0, ==, bpobj_open(&spa->spa_deferred_bpobj, spa->spa_meta_objset, obj)); /* * Generate some random noise for salted checksums to operate on. */ (void) random_get_pseudo_bytes(spa->spa_cksum_salt.zcs_bytes, sizeof (spa->spa_cksum_salt.zcs_bytes)); /* * Set pool properties. */ spa->spa_bootfs = zpool_prop_default_numeric(ZPOOL_PROP_BOOTFS); spa->spa_delegation = zpool_prop_default_numeric(ZPOOL_PROP_DELEGATION); spa->spa_failmode = zpool_prop_default_numeric(ZPOOL_PROP_FAILUREMODE); spa->spa_autoexpand = zpool_prop_default_numeric(ZPOOL_PROP_AUTOEXPAND); spa->spa_multihost = zpool_prop_default_numeric(ZPOOL_PROP_MULTIHOST); spa->spa_autotrim = zpool_prop_default_numeric(ZPOOL_PROP_AUTOTRIM); if (props != NULL) { spa_configfile_set(spa, props, B_FALSE); spa_sync_props(props, tx); } for (int i = 0; i < ndraid; i++) spa_feature_incr(spa, SPA_FEATURE_DRAID, tx); dmu_tx_commit(tx); spa->spa_sync_on = B_TRUE; txg_sync_start(dp); mmp_thread_start(spa); txg_wait_synced(dp, txg); spa_spawn_aux_threads(spa); spa_write_cachefile(spa, B_FALSE, B_TRUE, B_TRUE); /* * Don't count references from objsets that are already closed * and are making their way through the eviction process. */ spa_evicting_os_wait(spa); spa->spa_minref = zfs_refcount_count(&spa->spa_refcount); spa->spa_load_state = SPA_LOAD_NONE; spa_import_os(spa); mutex_exit(&spa_namespace_lock); return (0); } /* * Import a non-root pool into the system. */ int spa_import(char *pool, nvlist_t *config, nvlist_t *props, uint64_t flags) { spa_t *spa; char *altroot = NULL; spa_load_state_t state = SPA_LOAD_IMPORT; zpool_load_policy_t policy; spa_mode_t mode = spa_mode_global; uint64_t readonly = B_FALSE; int error; nvlist_t *nvroot; nvlist_t **spares, **l2cache; uint_t nspares, nl2cache; /* * If a pool with this name exists, return failure. */ mutex_enter(&spa_namespace_lock); if (spa_lookup(pool) != NULL) { mutex_exit(&spa_namespace_lock); return (SET_ERROR(EEXIST)); } /* * Create and initialize the spa structure. */ (void) nvlist_lookup_string(props, zpool_prop_to_name(ZPOOL_PROP_ALTROOT), &altroot); (void) nvlist_lookup_uint64(props, zpool_prop_to_name(ZPOOL_PROP_READONLY), &readonly); if (readonly) mode = SPA_MODE_READ; spa = spa_add(pool, config, altroot); spa->spa_import_flags = flags; /* * Verbatim import - Take a pool and insert it into the namespace * as if it had been loaded at boot. */ if (spa->spa_import_flags & ZFS_IMPORT_VERBATIM) { if (props != NULL) spa_configfile_set(spa, props, B_FALSE); spa_write_cachefile(spa, B_FALSE, B_TRUE, B_FALSE); spa_event_notify(spa, NULL, NULL, ESC_ZFS_POOL_IMPORT); zfs_dbgmsg("spa_import: verbatim import of %s", pool); mutex_exit(&spa_namespace_lock); return (0); } spa_activate(spa, mode); /* * Don't start async tasks until we know everything is healthy. */ spa_async_suspend(spa); zpool_get_load_policy(config, &policy); if (policy.zlp_rewind & ZPOOL_DO_REWIND) state = SPA_LOAD_RECOVER; spa->spa_config_source = SPA_CONFIG_SRC_TRYIMPORT; if (state != SPA_LOAD_RECOVER) { spa->spa_last_ubsync_txg = spa->spa_load_txg = 0; zfs_dbgmsg("spa_import: importing %s", pool); } else { zfs_dbgmsg("spa_import: importing %s, max_txg=%lld " "(RECOVERY MODE)", pool, (longlong_t)policy.zlp_txg); } error = spa_load_best(spa, state, policy.zlp_txg, policy.zlp_rewind); /* * Propagate anything learned while loading the pool and pass it * back to caller (i.e. rewind info, missing devices, etc). */ fnvlist_add_nvlist(config, ZPOOL_CONFIG_LOAD_INFO, spa->spa_load_info); spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER); /* * Toss any existing sparelist, as it doesn't have any validity * anymore, and conflicts with spa_has_spare(). */ if (spa->spa_spares.sav_config) { nvlist_free(spa->spa_spares.sav_config); spa->spa_spares.sav_config = NULL; spa_load_spares(spa); } if (spa->spa_l2cache.sav_config) { nvlist_free(spa->spa_l2cache.sav_config); spa->spa_l2cache.sav_config = NULL; spa_load_l2cache(spa); } nvroot = fnvlist_lookup_nvlist(config, ZPOOL_CONFIG_VDEV_TREE); spa_config_exit(spa, SCL_ALL, FTAG); if (props != NULL) spa_configfile_set(spa, props, B_FALSE); if (error != 0 || (props && spa_writeable(spa) && (error = spa_prop_set(spa, props)))) { spa_unload(spa); spa_deactivate(spa); spa_remove(spa); mutex_exit(&spa_namespace_lock); return (error); } spa_async_resume(spa); /* * Override any spares and level 2 cache devices as specified by * the user, as these may have correct device names/devids, etc. */ if (nvlist_lookup_nvlist_array(nvroot, ZPOOL_CONFIG_SPARES, &spares, &nspares) == 0) { if (spa->spa_spares.sav_config) fnvlist_remove(spa->spa_spares.sav_config, ZPOOL_CONFIG_SPARES); else spa->spa_spares.sav_config = fnvlist_alloc(); fnvlist_add_nvlist_array(spa->spa_spares.sav_config, ZPOOL_CONFIG_SPARES, (const nvlist_t * const *)spares, nspares); spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER); spa_load_spares(spa); spa_config_exit(spa, SCL_ALL, FTAG); spa->spa_spares.sav_sync = B_TRUE; } if (nvlist_lookup_nvlist_array(nvroot, ZPOOL_CONFIG_L2CACHE, &l2cache, &nl2cache) == 0) { if (spa->spa_l2cache.sav_config) fnvlist_remove(spa->spa_l2cache.sav_config, ZPOOL_CONFIG_L2CACHE); else spa->spa_l2cache.sav_config = fnvlist_alloc(); fnvlist_add_nvlist_array(spa->spa_l2cache.sav_config, ZPOOL_CONFIG_L2CACHE, (const nvlist_t * const *)l2cache, nl2cache); spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER); spa_load_l2cache(spa); spa_config_exit(spa, SCL_ALL, FTAG); spa->spa_l2cache.sav_sync = B_TRUE; } /* * Check for any removed devices. */ if (spa->spa_autoreplace) { spa_aux_check_removed(&spa->spa_spares); spa_aux_check_removed(&spa->spa_l2cache); } if (spa_writeable(spa)) { /* * Update the config cache to include the newly-imported pool. */ spa_config_update(spa, SPA_CONFIG_UPDATE_POOL); } /* * It's possible that the pool was expanded while it was exported. * We kick off an async task to handle this for us. */ spa_async_request(spa, SPA_ASYNC_AUTOEXPAND); spa_history_log_version(spa, "import", NULL); spa_event_notify(spa, NULL, NULL, ESC_ZFS_POOL_IMPORT); mutex_exit(&spa_namespace_lock); zvol_create_minors_recursive(pool); spa_import_os(spa); return (0); } nvlist_t * spa_tryimport(nvlist_t *tryconfig) { nvlist_t *config = NULL; char *poolname, *cachefile; spa_t *spa; uint64_t state; int error; zpool_load_policy_t policy; if (nvlist_lookup_string(tryconfig, ZPOOL_CONFIG_POOL_NAME, &poolname)) return (NULL); if (nvlist_lookup_uint64(tryconfig, ZPOOL_CONFIG_POOL_STATE, &state)) return (NULL); /* * Create and initialize the spa structure. */ mutex_enter(&spa_namespace_lock); spa = spa_add(TRYIMPORT_NAME, tryconfig, NULL); spa_activate(spa, SPA_MODE_READ); /* * Rewind pool if a max txg was provided. */ zpool_get_load_policy(spa->spa_config, &policy); if (policy.zlp_txg != UINT64_MAX) { spa->spa_load_max_txg = policy.zlp_txg; spa->spa_extreme_rewind = B_TRUE; zfs_dbgmsg("spa_tryimport: importing %s, max_txg=%lld", poolname, (longlong_t)policy.zlp_txg); } else { zfs_dbgmsg("spa_tryimport: importing %s", poolname); } if (nvlist_lookup_string(tryconfig, ZPOOL_CONFIG_CACHEFILE, &cachefile) == 0) { zfs_dbgmsg("spa_tryimport: using cachefile '%s'", cachefile); spa->spa_config_source = SPA_CONFIG_SRC_CACHEFILE; } else { spa->spa_config_source = SPA_CONFIG_SRC_SCAN; } error = spa_load(spa, SPA_LOAD_TRYIMPORT, SPA_IMPORT_EXISTING); /* * If 'tryconfig' was at least parsable, return the current config. */ if (spa->spa_root_vdev != NULL) { config = spa_config_generate(spa, NULL, -1ULL, B_TRUE); fnvlist_add_string(config, ZPOOL_CONFIG_POOL_NAME, poolname); fnvlist_add_uint64(config, ZPOOL_CONFIG_POOL_STATE, state); fnvlist_add_uint64(config, ZPOOL_CONFIG_TIMESTAMP, spa->spa_uberblock.ub_timestamp); fnvlist_add_nvlist(config, ZPOOL_CONFIG_LOAD_INFO, spa->spa_load_info); fnvlist_add_uint64(config, ZPOOL_CONFIG_ERRATA, spa->spa_errata); /* * If the bootfs property exists on this pool then we * copy it out so that external consumers can tell which * pools are bootable. */ if ((!error || error == EEXIST) && spa->spa_bootfs) { char *tmpname = kmem_alloc(MAXPATHLEN, KM_SLEEP); /* * We have to play games with the name since the * pool was opened as TRYIMPORT_NAME. */ if (dsl_dsobj_to_dsname(spa_name(spa), spa->spa_bootfs, tmpname) == 0) { char *cp; char *dsname; dsname = kmem_alloc(MAXPATHLEN, KM_SLEEP); cp = strchr(tmpname, '/'); if (cp == NULL) { (void) strlcpy(dsname, tmpname, MAXPATHLEN); } else { (void) snprintf(dsname, MAXPATHLEN, "%s/%s", poolname, ++cp); } fnvlist_add_string(config, ZPOOL_CONFIG_BOOTFS, dsname); kmem_free(dsname, MAXPATHLEN); } kmem_free(tmpname, MAXPATHLEN); } /* * Add the list of hot spares and level 2 cache devices. */ spa_config_enter(spa, SCL_CONFIG, FTAG, RW_READER); spa_add_spares(spa, config); spa_add_l2cache(spa, config); spa_config_exit(spa, SCL_CONFIG, FTAG); } spa_unload(spa); spa_deactivate(spa); spa_remove(spa); mutex_exit(&spa_namespace_lock); return (config); } /* * Pool export/destroy * * The act of destroying or exporting a pool is very simple. We make sure there * is no more pending I/O and any references to the pool are gone. Then, we * update the pool state and sync all the labels to disk, removing the * configuration from the cache afterwards. If the 'hardforce' flag is set, then * we don't sync the labels or remove the configuration cache. */ static int spa_export_common(const char *pool, int new_state, nvlist_t **oldconfig, boolean_t force, boolean_t hardforce) { int error; spa_t *spa; if (oldconfig) *oldconfig = NULL; if (!(spa_mode_global & SPA_MODE_WRITE)) return (SET_ERROR(EROFS)); mutex_enter(&spa_namespace_lock); if ((spa = spa_lookup(pool)) == NULL) { mutex_exit(&spa_namespace_lock); return (SET_ERROR(ENOENT)); } if (spa->spa_is_exporting) { /* the pool is being exported by another thread */ mutex_exit(&spa_namespace_lock); return (SET_ERROR(ZFS_ERR_EXPORT_IN_PROGRESS)); } spa->spa_is_exporting = B_TRUE; /* * Put a hold on the pool, drop the namespace lock, stop async tasks, * reacquire the namespace lock, and see if we can export. */ spa_open_ref(spa, FTAG); mutex_exit(&spa_namespace_lock); spa_async_suspend(spa); if (spa->spa_zvol_taskq) { zvol_remove_minors(spa, spa_name(spa), B_TRUE); taskq_wait(spa->spa_zvol_taskq); } mutex_enter(&spa_namespace_lock); spa_close(spa, FTAG); if (spa->spa_state == POOL_STATE_UNINITIALIZED) goto export_spa; /* * The pool will be in core if it's openable, in which case we can * modify its state. Objsets may be open only because they're dirty, * so we have to force it to sync before checking spa_refcnt. */ if (spa->spa_sync_on) { txg_wait_synced(spa->spa_dsl_pool, 0); spa_evicting_os_wait(spa); } /* * A pool cannot be exported or destroyed if there are active * references. If we are resetting a pool, allow references by * fault injection handlers. */ if (!spa_refcount_zero(spa) || (spa->spa_inject_ref != 0)) { error = SET_ERROR(EBUSY); goto fail; } if (spa->spa_sync_on) { vdev_t *rvd = spa->spa_root_vdev; /* * A pool cannot be exported if it has an active shared spare. * This is to prevent other pools stealing the active spare * from an exported pool. At user's own will, such pool can * be forcedly exported. */ if (!force && new_state == POOL_STATE_EXPORTED && spa_has_active_shared_spare(spa)) { error = SET_ERROR(EXDEV); goto fail; } /* * We're about to export or destroy this pool. Make sure * we stop all initialization and trim activity here before * we set the spa_final_txg. This will ensure that all * dirty data resulting from the initialization is * committed to disk before we unload the pool. */ vdev_initialize_stop_all(rvd, VDEV_INITIALIZE_ACTIVE); vdev_trim_stop_all(rvd, VDEV_TRIM_ACTIVE); vdev_autotrim_stop_all(spa); vdev_rebuild_stop_all(spa); /* * We want this to be reflected on every label, * so mark them all dirty. spa_unload() will do the * final sync that pushes these changes out. */ if (new_state != POOL_STATE_UNINITIALIZED && !hardforce) { spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER); spa->spa_state = new_state; vdev_config_dirty(rvd); spa_config_exit(spa, SCL_ALL, FTAG); } /* * If the log space map feature is enabled and the pool is * getting exported (but not destroyed), we want to spend some * time flushing as many metaslabs as we can in an attempt to * destroy log space maps and save import time. This has to be * done before we set the spa_final_txg, otherwise * spa_sync() -> spa_flush_metaslabs() may dirty the final TXGs. * spa_should_flush_logs_on_unload() should be called after * spa_state has been set to the new_state. */ if (spa_should_flush_logs_on_unload(spa)) spa_unload_log_sm_flush_all(spa); if (new_state != POOL_STATE_UNINITIALIZED && !hardforce) { spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER); spa->spa_final_txg = spa_last_synced_txg(spa) + TXG_DEFER_SIZE + 1; spa_config_exit(spa, SCL_ALL, FTAG); } } export_spa: spa_export_os(spa); if (new_state == POOL_STATE_DESTROYED) spa_event_notify(spa, NULL, NULL, ESC_ZFS_POOL_DESTROY); else if (new_state == POOL_STATE_EXPORTED) spa_event_notify(spa, NULL, NULL, ESC_ZFS_POOL_EXPORT); if (spa->spa_state != POOL_STATE_UNINITIALIZED) { spa_unload(spa); spa_deactivate(spa); } if (oldconfig && spa->spa_config) *oldconfig = fnvlist_dup(spa->spa_config); if (new_state != POOL_STATE_UNINITIALIZED) { if (!hardforce) spa_write_cachefile(spa, B_TRUE, B_TRUE, B_FALSE); spa_remove(spa); } else { /* * If spa_remove() is not called for this spa_t and * there is any possibility that it can be reused, * we make sure to reset the exporting flag. */ spa->spa_is_exporting = B_FALSE; } mutex_exit(&spa_namespace_lock); return (0); fail: spa->spa_is_exporting = B_FALSE; spa_async_resume(spa); mutex_exit(&spa_namespace_lock); return (error); } /* * Destroy a storage pool. */ int spa_destroy(const char *pool) { return (spa_export_common(pool, POOL_STATE_DESTROYED, NULL, B_FALSE, B_FALSE)); } /* * Export a storage pool. */ int spa_export(const char *pool, nvlist_t **oldconfig, boolean_t force, boolean_t hardforce) { return (spa_export_common(pool, POOL_STATE_EXPORTED, oldconfig, force, hardforce)); } /* * Similar to spa_export(), this unloads the spa_t without actually removing it * from the namespace in any way. */ int spa_reset(const char *pool) { return (spa_export_common(pool, POOL_STATE_UNINITIALIZED, NULL, B_FALSE, B_FALSE)); } /* * ========================================================================== * Device manipulation * ========================================================================== */ /* * This is called as a synctask to increment the draid feature flag */ static void spa_draid_feature_incr(void *arg, dmu_tx_t *tx) { spa_t *spa = dmu_tx_pool(tx)->dp_spa; int draid = (int)(uintptr_t)arg; for (int c = 0; c < draid; c++) spa_feature_incr(spa, SPA_FEATURE_DRAID, tx); } /* * Add a device to a storage pool. */ int spa_vdev_add(spa_t *spa, nvlist_t *nvroot) { uint64_t txg, ndraid = 0; int error; vdev_t *rvd = spa->spa_root_vdev; vdev_t *vd, *tvd; nvlist_t **spares, **l2cache; uint_t nspares, nl2cache; ASSERT(spa_writeable(spa)); txg = spa_vdev_enter(spa); if ((error = spa_config_parse(spa, &vd, nvroot, NULL, 0, VDEV_ALLOC_ADD)) != 0) return (spa_vdev_exit(spa, NULL, txg, error)); spa->spa_pending_vdev = vd; /* spa_vdev_exit() will clear this */ if (nvlist_lookup_nvlist_array(nvroot, ZPOOL_CONFIG_SPARES, &spares, &nspares) != 0) nspares = 0; if (nvlist_lookup_nvlist_array(nvroot, ZPOOL_CONFIG_L2CACHE, &l2cache, &nl2cache) != 0) nl2cache = 0; if (vd->vdev_children == 0 && nspares == 0 && nl2cache == 0) return (spa_vdev_exit(spa, vd, txg, EINVAL)); if (vd->vdev_children != 0 && (error = vdev_create(vd, txg, B_FALSE)) != 0) { return (spa_vdev_exit(spa, vd, txg, error)); } /* * The virtual dRAID spares must be added after vdev tree is created * and the vdev guids are generated. The guid of their associated * dRAID is stored in the config and used when opening the spare. */ if ((error = vdev_draid_spare_create(nvroot, vd, &ndraid, rvd->vdev_children)) == 0) { if (ndraid > 0 && nvlist_lookup_nvlist_array(nvroot, ZPOOL_CONFIG_SPARES, &spares, &nspares) != 0) nspares = 0; } else { return (spa_vdev_exit(spa, vd, txg, error)); } /* * We must validate the spares and l2cache devices after checking the * children. Otherwise, vdev_inuse() will blindly overwrite the spare. */ if ((error = spa_validate_aux(spa, nvroot, txg, VDEV_ALLOC_ADD)) != 0) return (spa_vdev_exit(spa, vd, txg, error)); /* * If we are in the middle of a device removal, we can only add * devices which match the existing devices in the pool. * If we are in the middle of a removal, or have some indirect * vdevs, we can not add raidz or dRAID top levels. */ if (spa->spa_vdev_removal != NULL || spa->spa_removing_phys.sr_prev_indirect_vdev != -1) { for (int c = 0; c < vd->vdev_children; c++) { tvd = vd->vdev_child[c]; if (spa->spa_vdev_removal != NULL && tvd->vdev_ashift != spa->spa_max_ashift) { return (spa_vdev_exit(spa, vd, txg, EINVAL)); } /* Fail if top level vdev is raidz or a dRAID */ if (vdev_get_nparity(tvd) != 0) return (spa_vdev_exit(spa, vd, txg, EINVAL)); /* * Need the top level mirror to be * a mirror of leaf vdevs only */ if (tvd->vdev_ops == &vdev_mirror_ops) { for (uint64_t cid = 0; cid < tvd->vdev_children; cid++) { vdev_t *cvd = tvd->vdev_child[cid]; if (!cvd->vdev_ops->vdev_op_leaf) { return (spa_vdev_exit(spa, vd, txg, EINVAL)); } } } } } for (int c = 0; c < vd->vdev_children; c++) { tvd = vd->vdev_child[c]; vdev_remove_child(vd, tvd); tvd->vdev_id = rvd->vdev_children; vdev_add_child(rvd, tvd); vdev_config_dirty(tvd); } if (nspares != 0) { spa_set_aux_vdevs(&spa->spa_spares, spares, nspares, ZPOOL_CONFIG_SPARES); spa_load_spares(spa); spa->spa_spares.sav_sync = B_TRUE; } if (nl2cache != 0) { spa_set_aux_vdevs(&spa->spa_l2cache, l2cache, nl2cache, ZPOOL_CONFIG_L2CACHE); spa_load_l2cache(spa); spa->spa_l2cache.sav_sync = B_TRUE; } /* * We can't increment a feature while holding spa_vdev so we * have to do it in a synctask. */ if (ndraid != 0) { dmu_tx_t *tx; tx = dmu_tx_create_assigned(spa->spa_dsl_pool, txg); dsl_sync_task_nowait(spa->spa_dsl_pool, spa_draid_feature_incr, (void *)(uintptr_t)ndraid, tx); dmu_tx_commit(tx); } /* * We have to be careful when adding new vdevs to an existing pool. * If other threads start allocating from these vdevs before we * sync the config cache, and we lose power, then upon reboot we may * fail to open the pool because there are DVAs that the config cache * can't translate. Therefore, we first add the vdevs without * initializing metaslabs; sync the config cache (via spa_vdev_exit()); * and then let spa_config_update() initialize the new metaslabs. * * spa_load() checks for added-but-not-initialized vdevs, so that * if we lose power at any point in this sequence, the remaining * steps will be completed the next time we load the pool. */ (void) spa_vdev_exit(spa, vd, txg, 0); mutex_enter(&spa_namespace_lock); spa_config_update(spa, SPA_CONFIG_UPDATE_POOL); spa_event_notify(spa, NULL, NULL, ESC_ZFS_VDEV_ADD); mutex_exit(&spa_namespace_lock); return (0); } /* * Attach a device to a mirror. The arguments are the path to any device * in the mirror, and the nvroot for the new device. If the path specifies * a device that is not mirrored, we automatically insert the mirror vdev. * * If 'replacing' is specified, the new device is intended to replace the * existing device; in this case the two devices are made into their own * mirror using the 'replacing' vdev, which is functionally identical to * the mirror vdev (it actually reuses all the same ops) but has a few * extra rules: you can't attach to it after it's been created, and upon * completion of resilvering, the first disk (the one being replaced) * is automatically detached. * * If 'rebuild' is specified, then sequential reconstruction (a.ka. rebuild) * should be performed instead of traditional healing reconstruction. From * an administrators perspective these are both resilver operations. */ int spa_vdev_attach(spa_t *spa, uint64_t guid, nvlist_t *nvroot, int replacing, int rebuild) { uint64_t txg, dtl_max_txg; vdev_t *rvd = spa->spa_root_vdev; vdev_t *oldvd, *newvd, *newrootvd, *pvd, *tvd; vdev_ops_t *pvops; char *oldvdpath, *newvdpath; int newvd_isspare; int error; ASSERT(spa_writeable(spa)); txg = spa_vdev_enter(spa); oldvd = spa_lookup_by_guid(spa, guid, B_FALSE); ASSERT(MUTEX_HELD(&spa_namespace_lock)); if (spa_feature_is_active(spa, SPA_FEATURE_POOL_CHECKPOINT)) { error = (spa_has_checkpoint(spa)) ? ZFS_ERR_CHECKPOINT_EXISTS : ZFS_ERR_DISCARDING_CHECKPOINT; return (spa_vdev_exit(spa, NULL, txg, error)); } if (rebuild) { if (!spa_feature_is_enabled(spa, SPA_FEATURE_DEVICE_REBUILD)) return (spa_vdev_exit(spa, NULL, txg, ENOTSUP)); if (dsl_scan_resilvering(spa_get_dsl(spa))) return (spa_vdev_exit(spa, NULL, txg, ZFS_ERR_RESILVER_IN_PROGRESS)); } else { if (vdev_rebuild_active(rvd)) return (spa_vdev_exit(spa, NULL, txg, ZFS_ERR_REBUILD_IN_PROGRESS)); } if (spa->spa_vdev_removal != NULL) return (spa_vdev_exit(spa, NULL, txg, EBUSY)); if (oldvd == NULL) return (spa_vdev_exit(spa, NULL, txg, ENODEV)); if (!oldvd->vdev_ops->vdev_op_leaf) return (spa_vdev_exit(spa, NULL, txg, ENOTSUP)); pvd = oldvd->vdev_parent; - if ((error = spa_config_parse(spa, &newrootvd, nvroot, NULL, 0, - VDEV_ALLOC_ATTACH)) != 0) + if (spa_config_parse(spa, &newrootvd, nvroot, NULL, 0, + VDEV_ALLOC_ATTACH) != 0) return (spa_vdev_exit(spa, NULL, txg, EINVAL)); if (newrootvd->vdev_children != 1) return (spa_vdev_exit(spa, newrootvd, txg, EINVAL)); newvd = newrootvd->vdev_child[0]; if (!newvd->vdev_ops->vdev_op_leaf) return (spa_vdev_exit(spa, newrootvd, txg, EINVAL)); if ((error = vdev_create(newrootvd, txg, replacing)) != 0) return (spa_vdev_exit(spa, newrootvd, txg, error)); /* * Spares can't replace logs */ if (oldvd->vdev_top->vdev_islog && newvd->vdev_isspare) return (spa_vdev_exit(spa, newrootvd, txg, ENOTSUP)); /* * A dRAID spare can only replace a child of its parent dRAID vdev. */ if (newvd->vdev_ops == &vdev_draid_spare_ops && oldvd->vdev_top != vdev_draid_spare_get_parent(newvd)) { return (spa_vdev_exit(spa, newrootvd, txg, ENOTSUP)); } if (rebuild) { /* * For rebuilds, the top vdev must support reconstruction * using only space maps. This means the only allowable * vdevs types are the root vdev, a mirror, or dRAID. */ tvd = pvd; if (pvd->vdev_top != NULL) tvd = pvd->vdev_top; if (tvd->vdev_ops != &vdev_mirror_ops && tvd->vdev_ops != &vdev_root_ops && tvd->vdev_ops != &vdev_draid_ops) { return (spa_vdev_exit(spa, newrootvd, txg, ENOTSUP)); } } if (!replacing) { /* * For attach, the only allowable parent is a mirror or the root * vdev. */ if (pvd->vdev_ops != &vdev_mirror_ops && pvd->vdev_ops != &vdev_root_ops) return (spa_vdev_exit(spa, newrootvd, txg, ENOTSUP)); pvops = &vdev_mirror_ops; } else { /* * Active hot spares can only be replaced by inactive hot * spares. */ if (pvd->vdev_ops == &vdev_spare_ops && oldvd->vdev_isspare && !spa_has_spare(spa, newvd->vdev_guid)) return (spa_vdev_exit(spa, newrootvd, txg, ENOTSUP)); /* * If the source is a hot spare, and the parent isn't already a * spare, then we want to create a new hot spare. Otherwise, we * want to create a replacing vdev. The user is not allowed to * attach to a spared vdev child unless the 'isspare' state is * the same (spare replaces spare, non-spare replaces * non-spare). */ if (pvd->vdev_ops == &vdev_replacing_ops && spa_version(spa) < SPA_VERSION_MULTI_REPLACE) { return (spa_vdev_exit(spa, newrootvd, txg, ENOTSUP)); } else if (pvd->vdev_ops == &vdev_spare_ops && newvd->vdev_isspare != oldvd->vdev_isspare) { return (spa_vdev_exit(spa, newrootvd, txg, ENOTSUP)); } if (newvd->vdev_isspare) pvops = &vdev_spare_ops; else pvops = &vdev_replacing_ops; } /* * Make sure the new device is big enough. */ if (newvd->vdev_asize < vdev_get_min_asize(oldvd)) return (spa_vdev_exit(spa, newrootvd, txg, EOVERFLOW)); /* * The new device cannot have a higher alignment requirement * than the top-level vdev. */ if (newvd->vdev_ashift > oldvd->vdev_top->vdev_ashift) return (spa_vdev_exit(spa, newrootvd, txg, ENOTSUP)); /* * If this is an in-place replacement, update oldvd's path and devid * to make it distinguishable from newvd, and unopenable from now on. */ if (strcmp(oldvd->vdev_path, newvd->vdev_path) == 0) { spa_strfree(oldvd->vdev_path); oldvd->vdev_path = kmem_alloc(strlen(newvd->vdev_path) + 5, KM_SLEEP); (void) snprintf(oldvd->vdev_path, strlen(newvd->vdev_path) + 5, "%s/%s", newvd->vdev_path, "old"); if (oldvd->vdev_devid != NULL) { spa_strfree(oldvd->vdev_devid); oldvd->vdev_devid = NULL; } } /* * If the parent is not a mirror, or if we're replacing, insert the new * mirror/replacing/spare vdev above oldvd. */ if (pvd->vdev_ops != pvops) pvd = vdev_add_parent(oldvd, pvops); ASSERT(pvd->vdev_top->vdev_parent == rvd); ASSERT(pvd->vdev_ops == pvops); ASSERT(oldvd->vdev_parent == pvd); /* * Extract the new device from its root and add it to pvd. */ vdev_remove_child(newrootvd, newvd); newvd->vdev_id = pvd->vdev_children; newvd->vdev_crtxg = oldvd->vdev_crtxg; vdev_add_child(pvd, newvd); /* * Reevaluate the parent vdev state. */ vdev_propagate_state(pvd); tvd = newvd->vdev_top; ASSERT(pvd->vdev_top == tvd); ASSERT(tvd->vdev_parent == rvd); vdev_config_dirty(tvd); /* * Set newvd's DTL to [TXG_INITIAL, dtl_max_txg) so that we account * for any dmu_sync-ed blocks. It will propagate upward when * spa_vdev_exit() calls vdev_dtl_reassess(). */ dtl_max_txg = txg + TXG_CONCURRENT_STATES; vdev_dtl_dirty(newvd, DTL_MISSING, TXG_INITIAL, dtl_max_txg - TXG_INITIAL); if (newvd->vdev_isspare) { spa_spare_activate(newvd); spa_event_notify(spa, newvd, NULL, ESC_ZFS_VDEV_SPARE); } oldvdpath = spa_strdup(oldvd->vdev_path); newvdpath = spa_strdup(newvd->vdev_path); newvd_isspare = newvd->vdev_isspare; /* * Mark newvd's DTL dirty in this txg. */ vdev_dirty(tvd, VDD_DTL, newvd, txg); /* * Schedule the resilver or rebuild to restart in the future. We do * this to ensure that dmu_sync-ed blocks have been stitched into the * respective datasets. */ if (rebuild) { newvd->vdev_rebuild_txg = txg; vdev_rebuild(tvd); } else { newvd->vdev_resilver_txg = txg; if (dsl_scan_resilvering(spa_get_dsl(spa)) && spa_feature_is_enabled(spa, SPA_FEATURE_RESILVER_DEFER)) { vdev_defer_resilver(newvd); } else { dsl_scan_restart_resilver(spa->spa_dsl_pool, dtl_max_txg); } } if (spa->spa_bootfs) spa_event_notify(spa, newvd, NULL, ESC_ZFS_BOOTFS_VDEV_ATTACH); spa_event_notify(spa, newvd, NULL, ESC_ZFS_VDEV_ATTACH); /* * Commit the config */ (void) spa_vdev_exit(spa, newrootvd, dtl_max_txg, 0); spa_history_log_internal(spa, "vdev attach", NULL, "%s vdev=%s %s vdev=%s", replacing && newvd_isspare ? "spare in" : replacing ? "replace" : "attach", newvdpath, replacing ? "for" : "to", oldvdpath); spa_strfree(oldvdpath); spa_strfree(newvdpath); return (0); } /* * Detach a device from a mirror or replacing vdev. * * If 'replace_done' is specified, only detach if the parent * is a replacing vdev. */ int spa_vdev_detach(spa_t *spa, uint64_t guid, uint64_t pguid, int replace_done) { uint64_t txg; int error; vdev_t *rvd __maybe_unused = spa->spa_root_vdev; vdev_t *vd, *pvd, *cvd, *tvd; boolean_t unspare = B_FALSE; uint64_t unspare_guid = 0; char *vdpath; ASSERT(spa_writeable(spa)); txg = spa_vdev_detach_enter(spa, guid); vd = spa_lookup_by_guid(spa, guid, B_FALSE); /* * Besides being called directly from the userland through the * ioctl interface, spa_vdev_detach() can be potentially called * at the end of spa_vdev_resilver_done(). * * In the regular case, when we have a checkpoint this shouldn't * happen as we never empty the DTLs of a vdev during the scrub * [see comment in dsl_scan_done()]. Thus spa_vdev_resilvering_done() * should never get here when we have a checkpoint. * * That said, even in a case when we checkpoint the pool exactly * as spa_vdev_resilver_done() calls this function everything * should be fine as the resilver will return right away. */ ASSERT(MUTEX_HELD(&spa_namespace_lock)); if (spa_feature_is_active(spa, SPA_FEATURE_POOL_CHECKPOINT)) { error = (spa_has_checkpoint(spa)) ? ZFS_ERR_CHECKPOINT_EXISTS : ZFS_ERR_DISCARDING_CHECKPOINT; return (spa_vdev_exit(spa, NULL, txg, error)); } if (vd == NULL) return (spa_vdev_exit(spa, NULL, txg, ENODEV)); if (!vd->vdev_ops->vdev_op_leaf) return (spa_vdev_exit(spa, NULL, txg, ENOTSUP)); pvd = vd->vdev_parent; /* * If the parent/child relationship is not as expected, don't do it. * Consider M(A,R(B,C)) -- that is, a mirror of A with a replacing * vdev that's replacing B with C. The user's intent in replacing * is to go from M(A,B) to M(A,C). If the user decides to cancel * the replace by detaching C, the expected behavior is to end up * M(A,B). But suppose that right after deciding to detach C, * the replacement of B completes. We would have M(A,C), and then * ask to detach C, which would leave us with just A -- not what * the user wanted. To prevent this, we make sure that the * parent/child relationship hasn't changed -- in this example, * that C's parent is still the replacing vdev R. */ if (pvd->vdev_guid != pguid && pguid != 0) return (spa_vdev_exit(spa, NULL, txg, EBUSY)); /* * Only 'replacing' or 'spare' vdevs can be replaced. */ if (replace_done && pvd->vdev_ops != &vdev_replacing_ops && pvd->vdev_ops != &vdev_spare_ops) return (spa_vdev_exit(spa, NULL, txg, ENOTSUP)); ASSERT(pvd->vdev_ops != &vdev_spare_ops || spa_version(spa) >= SPA_VERSION_SPARES); /* * Only mirror, replacing, and spare vdevs support detach. */ if (pvd->vdev_ops != &vdev_replacing_ops && pvd->vdev_ops != &vdev_mirror_ops && pvd->vdev_ops != &vdev_spare_ops) return (spa_vdev_exit(spa, NULL, txg, ENOTSUP)); /* * If this device has the only valid copy of some data, * we cannot safely detach it. */ if (vdev_dtl_required(vd)) return (spa_vdev_exit(spa, NULL, txg, EBUSY)); ASSERT(pvd->vdev_children >= 2); /* * If we are detaching the second disk from a replacing vdev, then * check to see if we changed the original vdev's path to have "/old" * at the end in spa_vdev_attach(). If so, undo that change now. */ if (pvd->vdev_ops == &vdev_replacing_ops && vd->vdev_id > 0 && vd->vdev_path != NULL) { size_t len = strlen(vd->vdev_path); for (int c = 0; c < pvd->vdev_children; c++) { cvd = pvd->vdev_child[c]; if (cvd == vd || cvd->vdev_path == NULL) continue; if (strncmp(cvd->vdev_path, vd->vdev_path, len) == 0 && strcmp(cvd->vdev_path + len, "/old") == 0) { spa_strfree(cvd->vdev_path); cvd->vdev_path = spa_strdup(vd->vdev_path); break; } } } /* * If we are detaching the original disk from a normal spare, then it * implies that the spare should become a real disk, and be removed * from the active spare list for the pool. dRAID spares on the * other hand are coupled to the pool and thus should never be removed * from the spares list. */ if (pvd->vdev_ops == &vdev_spare_ops && vd->vdev_id == 0) { vdev_t *last_cvd = pvd->vdev_child[pvd->vdev_children - 1]; if (last_cvd->vdev_isspare && last_cvd->vdev_ops != &vdev_draid_spare_ops) { unspare = B_TRUE; } } /* * Erase the disk labels so the disk can be used for other things. * This must be done after all other error cases are handled, * but before we disembowel vd (so we can still do I/O to it). * But if we can't do it, don't treat the error as fatal -- * it may be that the unwritability of the disk is the reason * it's being detached! */ - error = vdev_label_init(vd, 0, VDEV_LABEL_REMOVE); + (void) vdev_label_init(vd, 0, VDEV_LABEL_REMOVE); /* * Remove vd from its parent and compact the parent's children. */ vdev_remove_child(pvd, vd); vdev_compact_children(pvd); /* * Remember one of the remaining children so we can get tvd below. */ cvd = pvd->vdev_child[pvd->vdev_children - 1]; /* * If we need to remove the remaining child from the list of hot spares, * do it now, marking the vdev as no longer a spare in the process. * We must do this before vdev_remove_parent(), because that can * change the GUID if it creates a new toplevel GUID. For a similar * reason, we must remove the spare now, in the same txg as the detach; * otherwise someone could attach a new sibling, change the GUID, and * the subsequent attempt to spa_vdev_remove(unspare_guid) would fail. */ if (unspare) { ASSERT(cvd->vdev_isspare); spa_spare_remove(cvd); unspare_guid = cvd->vdev_guid; (void) spa_vdev_remove(spa, unspare_guid, B_TRUE); cvd->vdev_unspare = B_TRUE; } /* * If the parent mirror/replacing vdev only has one child, * the parent is no longer needed. Remove it from the tree. */ if (pvd->vdev_children == 1) { if (pvd->vdev_ops == &vdev_spare_ops) cvd->vdev_unspare = B_FALSE; vdev_remove_parent(cvd); } /* * We don't set tvd until now because the parent we just removed * may have been the previous top-level vdev. */ tvd = cvd->vdev_top; ASSERT(tvd->vdev_parent == rvd); /* * Reevaluate the parent vdev state. */ vdev_propagate_state(cvd); /* * If the 'autoexpand' property is set on the pool then automatically * try to expand the size of the pool. For example if the device we * just detached was smaller than the others, it may be possible to * add metaslabs (i.e. grow the pool). We need to reopen the vdev * first so that we can obtain the updated sizes of the leaf vdevs. */ if (spa->spa_autoexpand) { vdev_reopen(tvd); vdev_expand(tvd, txg); } vdev_config_dirty(tvd); /* * Mark vd's DTL as dirty in this txg. vdev_dtl_sync() will see that * vd->vdev_detached is set and free vd's DTL object in syncing context. * But first make sure we're not on any *other* txg's DTL list, to * prevent vd from being accessed after it's freed. */ vdpath = spa_strdup(vd->vdev_path ? vd->vdev_path : "none"); for (int t = 0; t < TXG_SIZE; t++) (void) txg_list_remove_this(&tvd->vdev_dtl_list, vd, t); vd->vdev_detached = B_TRUE; vdev_dirty(tvd, VDD_DTL, vd, txg); spa_event_notify(spa, vd, NULL, ESC_ZFS_VDEV_REMOVE); spa_notify_waiters(spa); /* hang on to the spa before we release the lock */ spa_open_ref(spa, FTAG); error = spa_vdev_exit(spa, vd, txg, 0); spa_history_log_internal(spa, "detach", NULL, "vdev=%s", vdpath); spa_strfree(vdpath); /* * If this was the removal of the original device in a hot spare vdev, * then we want to go through and remove the device from the hot spare * list of every other pool. */ if (unspare) { spa_t *altspa = NULL; mutex_enter(&spa_namespace_lock); while ((altspa = spa_next(altspa)) != NULL) { if (altspa->spa_state != POOL_STATE_ACTIVE || altspa == spa) continue; spa_open_ref(altspa, FTAG); mutex_exit(&spa_namespace_lock); (void) spa_vdev_remove(altspa, unspare_guid, B_TRUE); mutex_enter(&spa_namespace_lock); spa_close(altspa, FTAG); } mutex_exit(&spa_namespace_lock); /* search the rest of the vdevs for spares to remove */ spa_vdev_resilver_done(spa); } /* all done with the spa; OK to release */ mutex_enter(&spa_namespace_lock); spa_close(spa, FTAG); mutex_exit(&spa_namespace_lock); return (error); } static int spa_vdev_initialize_impl(spa_t *spa, uint64_t guid, uint64_t cmd_type, list_t *vd_list) { ASSERT(MUTEX_HELD(&spa_namespace_lock)); spa_config_enter(spa, SCL_CONFIG | SCL_STATE, FTAG, RW_READER); /* Look up vdev and ensure it's a leaf. */ vdev_t *vd = spa_lookup_by_guid(spa, guid, B_FALSE); if (vd == NULL || vd->vdev_detached) { spa_config_exit(spa, SCL_CONFIG | SCL_STATE, FTAG); return (SET_ERROR(ENODEV)); } else if (!vd->vdev_ops->vdev_op_leaf || !vdev_is_concrete(vd)) { spa_config_exit(spa, SCL_CONFIG | SCL_STATE, FTAG); return (SET_ERROR(EINVAL)); } else if (!vdev_writeable(vd)) { spa_config_exit(spa, SCL_CONFIG | SCL_STATE, FTAG); return (SET_ERROR(EROFS)); } mutex_enter(&vd->vdev_initialize_lock); spa_config_exit(spa, SCL_CONFIG | SCL_STATE, FTAG); /* * When we activate an initialize action we check to see * if the vdev_initialize_thread is NULL. We do this instead * of using the vdev_initialize_state since there might be * a previous initialization process which has completed but * the thread is not exited. */ if (cmd_type == POOL_INITIALIZE_START && (vd->vdev_initialize_thread != NULL || vd->vdev_top->vdev_removing)) { mutex_exit(&vd->vdev_initialize_lock); return (SET_ERROR(EBUSY)); } else if (cmd_type == POOL_INITIALIZE_CANCEL && (vd->vdev_initialize_state != VDEV_INITIALIZE_ACTIVE && vd->vdev_initialize_state != VDEV_INITIALIZE_SUSPENDED)) { mutex_exit(&vd->vdev_initialize_lock); return (SET_ERROR(ESRCH)); } else if (cmd_type == POOL_INITIALIZE_SUSPEND && vd->vdev_initialize_state != VDEV_INITIALIZE_ACTIVE) { mutex_exit(&vd->vdev_initialize_lock); return (SET_ERROR(ESRCH)); } switch (cmd_type) { case POOL_INITIALIZE_START: vdev_initialize(vd); break; case POOL_INITIALIZE_CANCEL: vdev_initialize_stop(vd, VDEV_INITIALIZE_CANCELED, vd_list); break; case POOL_INITIALIZE_SUSPEND: vdev_initialize_stop(vd, VDEV_INITIALIZE_SUSPENDED, vd_list); break; default: panic("invalid cmd_type %llu", (unsigned long long)cmd_type); } mutex_exit(&vd->vdev_initialize_lock); return (0); } int spa_vdev_initialize(spa_t *spa, nvlist_t *nv, uint64_t cmd_type, nvlist_t *vdev_errlist) { int total_errors = 0; list_t vd_list; list_create(&vd_list, sizeof (vdev_t), offsetof(vdev_t, vdev_initialize_node)); /* * We hold the namespace lock through the whole function * to prevent any changes to the pool while we're starting or * stopping initialization. The config and state locks are held so that * we can properly assess the vdev state before we commit to * the initializing operation. */ mutex_enter(&spa_namespace_lock); for (nvpair_t *pair = nvlist_next_nvpair(nv, NULL); pair != NULL; pair = nvlist_next_nvpair(nv, pair)) { uint64_t vdev_guid = fnvpair_value_uint64(pair); int error = spa_vdev_initialize_impl(spa, vdev_guid, cmd_type, &vd_list); if (error != 0) { char guid_as_str[MAXNAMELEN]; (void) snprintf(guid_as_str, sizeof (guid_as_str), "%llu", (unsigned long long)vdev_guid); fnvlist_add_int64(vdev_errlist, guid_as_str, error); total_errors++; } } /* Wait for all initialize threads to stop. */ vdev_initialize_stop_wait(spa, &vd_list); /* Sync out the initializing state */ txg_wait_synced(spa->spa_dsl_pool, 0); mutex_exit(&spa_namespace_lock); list_destroy(&vd_list); return (total_errors); } static int spa_vdev_trim_impl(spa_t *spa, uint64_t guid, uint64_t cmd_type, uint64_t rate, boolean_t partial, boolean_t secure, list_t *vd_list) { ASSERT(MUTEX_HELD(&spa_namespace_lock)); spa_config_enter(spa, SCL_CONFIG | SCL_STATE, FTAG, RW_READER); /* Look up vdev and ensure it's a leaf. */ vdev_t *vd = spa_lookup_by_guid(spa, guid, B_FALSE); if (vd == NULL || vd->vdev_detached) { spa_config_exit(spa, SCL_CONFIG | SCL_STATE, FTAG); return (SET_ERROR(ENODEV)); } else if (!vd->vdev_ops->vdev_op_leaf || !vdev_is_concrete(vd)) { spa_config_exit(spa, SCL_CONFIG | SCL_STATE, FTAG); return (SET_ERROR(EINVAL)); } else if (!vdev_writeable(vd)) { spa_config_exit(spa, SCL_CONFIG | SCL_STATE, FTAG); return (SET_ERROR(EROFS)); } else if (!vd->vdev_has_trim) { spa_config_exit(spa, SCL_CONFIG | SCL_STATE, FTAG); return (SET_ERROR(EOPNOTSUPP)); } else if (secure && !vd->vdev_has_securetrim) { spa_config_exit(spa, SCL_CONFIG | SCL_STATE, FTAG); return (SET_ERROR(EOPNOTSUPP)); } mutex_enter(&vd->vdev_trim_lock); spa_config_exit(spa, SCL_CONFIG | SCL_STATE, FTAG); /* * When we activate a TRIM action we check to see if the * vdev_trim_thread is NULL. We do this instead of using the * vdev_trim_state since there might be a previous TRIM process * which has completed but the thread is not exited. */ if (cmd_type == POOL_TRIM_START && (vd->vdev_trim_thread != NULL || vd->vdev_top->vdev_removing)) { mutex_exit(&vd->vdev_trim_lock); return (SET_ERROR(EBUSY)); } else if (cmd_type == POOL_TRIM_CANCEL && (vd->vdev_trim_state != VDEV_TRIM_ACTIVE && vd->vdev_trim_state != VDEV_TRIM_SUSPENDED)) { mutex_exit(&vd->vdev_trim_lock); return (SET_ERROR(ESRCH)); } else if (cmd_type == POOL_TRIM_SUSPEND && vd->vdev_trim_state != VDEV_TRIM_ACTIVE) { mutex_exit(&vd->vdev_trim_lock); return (SET_ERROR(ESRCH)); } switch (cmd_type) { case POOL_TRIM_START: vdev_trim(vd, rate, partial, secure); break; case POOL_TRIM_CANCEL: vdev_trim_stop(vd, VDEV_TRIM_CANCELED, vd_list); break; case POOL_TRIM_SUSPEND: vdev_trim_stop(vd, VDEV_TRIM_SUSPENDED, vd_list); break; default: panic("invalid cmd_type %llu", (unsigned long long)cmd_type); } mutex_exit(&vd->vdev_trim_lock); return (0); } /* * Initiates a manual TRIM for the requested vdevs. This kicks off individual * TRIM threads for each child vdev. These threads pass over all of the free * space in the vdev's metaslabs and issues TRIM commands for that space. */ int spa_vdev_trim(spa_t *spa, nvlist_t *nv, uint64_t cmd_type, uint64_t rate, boolean_t partial, boolean_t secure, nvlist_t *vdev_errlist) { int total_errors = 0; list_t vd_list; list_create(&vd_list, sizeof (vdev_t), offsetof(vdev_t, vdev_trim_node)); /* * We hold the namespace lock through the whole function * to prevent any changes to the pool while we're starting or * stopping TRIM. The config and state locks are held so that * we can properly assess the vdev state before we commit to * the TRIM operation. */ mutex_enter(&spa_namespace_lock); for (nvpair_t *pair = nvlist_next_nvpair(nv, NULL); pair != NULL; pair = nvlist_next_nvpair(nv, pair)) { uint64_t vdev_guid = fnvpair_value_uint64(pair); int error = spa_vdev_trim_impl(spa, vdev_guid, cmd_type, rate, partial, secure, &vd_list); if (error != 0) { char guid_as_str[MAXNAMELEN]; (void) snprintf(guid_as_str, sizeof (guid_as_str), "%llu", (unsigned long long)vdev_guid); fnvlist_add_int64(vdev_errlist, guid_as_str, error); total_errors++; } } /* Wait for all TRIM threads to stop. */ vdev_trim_stop_wait(spa, &vd_list); /* Sync out the TRIM state */ txg_wait_synced(spa->spa_dsl_pool, 0); mutex_exit(&spa_namespace_lock); list_destroy(&vd_list); return (total_errors); } /* * Split a set of devices from their mirrors, and create a new pool from them. */ int spa_vdev_split_mirror(spa_t *spa, const char *newname, nvlist_t *config, nvlist_t *props, boolean_t exp) { int error = 0; uint64_t txg, *glist; spa_t *newspa; uint_t c, children, lastlog; nvlist_t **child, *nvl, *tmp; dmu_tx_t *tx; char *altroot = NULL; vdev_t *rvd, **vml = NULL; /* vdev modify list */ boolean_t activate_slog; ASSERT(spa_writeable(spa)); txg = spa_vdev_enter(spa); ASSERT(MUTEX_HELD(&spa_namespace_lock)); if (spa_feature_is_active(spa, SPA_FEATURE_POOL_CHECKPOINT)) { error = (spa_has_checkpoint(spa)) ? ZFS_ERR_CHECKPOINT_EXISTS : ZFS_ERR_DISCARDING_CHECKPOINT; return (spa_vdev_exit(spa, NULL, txg, error)); } /* clear the log and flush everything up to now */ activate_slog = spa_passivate_log(spa); (void) spa_vdev_config_exit(spa, NULL, txg, 0, FTAG); error = spa_reset_logs(spa); txg = spa_vdev_config_enter(spa); if (activate_slog) spa_activate_log(spa); if (error != 0) return (spa_vdev_exit(spa, NULL, txg, error)); /* check new spa name before going any further */ if (spa_lookup(newname) != NULL) return (spa_vdev_exit(spa, NULL, txg, EEXIST)); /* * scan through all the children to ensure they're all mirrors */ if (nvlist_lookup_nvlist(config, ZPOOL_CONFIG_VDEV_TREE, &nvl) != 0 || nvlist_lookup_nvlist_array(nvl, ZPOOL_CONFIG_CHILDREN, &child, &children) != 0) return (spa_vdev_exit(spa, NULL, txg, EINVAL)); /* first, check to ensure we've got the right child count */ rvd = spa->spa_root_vdev; lastlog = 0; for (c = 0; c < rvd->vdev_children; c++) { vdev_t *vd = rvd->vdev_child[c]; /* don't count the holes & logs as children */ if (vd->vdev_islog || (vd->vdev_ops != &vdev_indirect_ops && !vdev_is_concrete(vd))) { if (lastlog == 0) lastlog = c; continue; } lastlog = 0; } if (children != (lastlog != 0 ? lastlog : rvd->vdev_children)) return (spa_vdev_exit(spa, NULL, txg, EINVAL)); /* next, ensure no spare or cache devices are part of the split */ if (nvlist_lookup_nvlist(nvl, ZPOOL_CONFIG_SPARES, &tmp) == 0 || nvlist_lookup_nvlist(nvl, ZPOOL_CONFIG_L2CACHE, &tmp) == 0) return (spa_vdev_exit(spa, NULL, txg, EINVAL)); vml = kmem_zalloc(children * sizeof (vdev_t *), KM_SLEEP); glist = kmem_zalloc(children * sizeof (uint64_t), KM_SLEEP); /* then, loop over each vdev and validate it */ for (c = 0; c < children; c++) { uint64_t is_hole = 0; (void) nvlist_lookup_uint64(child[c], ZPOOL_CONFIG_IS_HOLE, &is_hole); if (is_hole != 0) { if (spa->spa_root_vdev->vdev_child[c]->vdev_ishole || spa->spa_root_vdev->vdev_child[c]->vdev_islog) { continue; } else { error = SET_ERROR(EINVAL); break; } } /* deal with indirect vdevs */ if (spa->spa_root_vdev->vdev_child[c]->vdev_ops == &vdev_indirect_ops) continue; /* which disk is going to be split? */ if (nvlist_lookup_uint64(child[c], ZPOOL_CONFIG_GUID, &glist[c]) != 0) { error = SET_ERROR(EINVAL); break; } /* look it up in the spa */ vml[c] = spa_lookup_by_guid(spa, glist[c], B_FALSE); if (vml[c] == NULL) { error = SET_ERROR(ENODEV); break; } /* make sure there's nothing stopping the split */ if (vml[c]->vdev_parent->vdev_ops != &vdev_mirror_ops || vml[c]->vdev_islog || !vdev_is_concrete(vml[c]) || vml[c]->vdev_isspare || vml[c]->vdev_isl2cache || !vdev_writeable(vml[c]) || vml[c]->vdev_children != 0 || vml[c]->vdev_state != VDEV_STATE_HEALTHY || c != spa->spa_root_vdev->vdev_child[c]->vdev_id) { error = SET_ERROR(EINVAL); break; } if (vdev_dtl_required(vml[c]) || vdev_resilver_needed(vml[c], NULL, NULL)) { error = SET_ERROR(EBUSY); break; } /* we need certain info from the top level */ fnvlist_add_uint64(child[c], ZPOOL_CONFIG_METASLAB_ARRAY, vml[c]->vdev_top->vdev_ms_array); fnvlist_add_uint64(child[c], ZPOOL_CONFIG_METASLAB_SHIFT, vml[c]->vdev_top->vdev_ms_shift); fnvlist_add_uint64(child[c], ZPOOL_CONFIG_ASIZE, vml[c]->vdev_top->vdev_asize); fnvlist_add_uint64(child[c], ZPOOL_CONFIG_ASHIFT, vml[c]->vdev_top->vdev_ashift); /* transfer per-vdev ZAPs */ ASSERT3U(vml[c]->vdev_leaf_zap, !=, 0); VERIFY0(nvlist_add_uint64(child[c], ZPOOL_CONFIG_VDEV_LEAF_ZAP, vml[c]->vdev_leaf_zap)); ASSERT3U(vml[c]->vdev_top->vdev_top_zap, !=, 0); VERIFY0(nvlist_add_uint64(child[c], ZPOOL_CONFIG_VDEV_TOP_ZAP, vml[c]->vdev_parent->vdev_top_zap)); } if (error != 0) { kmem_free(vml, children * sizeof (vdev_t *)); kmem_free(glist, children * sizeof (uint64_t)); return (spa_vdev_exit(spa, NULL, txg, error)); } /* stop writers from using the disks */ for (c = 0; c < children; c++) { if (vml[c] != NULL) vml[c]->vdev_offline = B_TRUE; } vdev_reopen(spa->spa_root_vdev); /* * Temporarily record the splitting vdevs in the spa config. This * will disappear once the config is regenerated. */ nvl = fnvlist_alloc(); fnvlist_add_uint64_array(nvl, ZPOOL_CONFIG_SPLIT_LIST, glist, children); kmem_free(glist, children * sizeof (uint64_t)); mutex_enter(&spa->spa_props_lock); fnvlist_add_nvlist(spa->spa_config, ZPOOL_CONFIG_SPLIT, nvl); mutex_exit(&spa->spa_props_lock); spa->spa_config_splitting = nvl; vdev_config_dirty(spa->spa_root_vdev); /* configure and create the new pool */ fnvlist_add_string(config, ZPOOL_CONFIG_POOL_NAME, newname); fnvlist_add_uint64(config, ZPOOL_CONFIG_POOL_STATE, exp ? POOL_STATE_EXPORTED : POOL_STATE_ACTIVE); fnvlist_add_uint64(config, ZPOOL_CONFIG_VERSION, spa_version(spa)); fnvlist_add_uint64(config, ZPOOL_CONFIG_POOL_TXG, spa->spa_config_txg); fnvlist_add_uint64(config, ZPOOL_CONFIG_POOL_GUID, spa_generate_guid(NULL)); VERIFY0(nvlist_add_boolean(config, ZPOOL_CONFIG_HAS_PER_VDEV_ZAPS)); (void) nvlist_lookup_string(props, zpool_prop_to_name(ZPOOL_PROP_ALTROOT), &altroot); /* add the new pool to the namespace */ newspa = spa_add(newname, config, altroot); newspa->spa_avz_action = AVZ_ACTION_REBUILD; newspa->spa_config_txg = spa->spa_config_txg; spa_set_log_state(newspa, SPA_LOG_CLEAR); /* release the spa config lock, retaining the namespace lock */ spa_vdev_config_exit(spa, NULL, txg, 0, FTAG); if (zio_injection_enabled) zio_handle_panic_injection(spa, FTAG, 1); spa_activate(newspa, spa_mode_global); spa_async_suspend(newspa); /* * Temporarily stop the initializing and TRIM activity. We set the * state to ACTIVE so that we know to resume initializing or TRIM * once the split has completed. */ list_t vd_initialize_list; list_create(&vd_initialize_list, sizeof (vdev_t), offsetof(vdev_t, vdev_initialize_node)); list_t vd_trim_list; list_create(&vd_trim_list, sizeof (vdev_t), offsetof(vdev_t, vdev_trim_node)); for (c = 0; c < children; c++) { if (vml[c] != NULL && vml[c]->vdev_ops != &vdev_indirect_ops) { mutex_enter(&vml[c]->vdev_initialize_lock); vdev_initialize_stop(vml[c], VDEV_INITIALIZE_ACTIVE, &vd_initialize_list); mutex_exit(&vml[c]->vdev_initialize_lock); mutex_enter(&vml[c]->vdev_trim_lock); vdev_trim_stop(vml[c], VDEV_TRIM_ACTIVE, &vd_trim_list); mutex_exit(&vml[c]->vdev_trim_lock); } } vdev_initialize_stop_wait(spa, &vd_initialize_list); vdev_trim_stop_wait(spa, &vd_trim_list); list_destroy(&vd_initialize_list); list_destroy(&vd_trim_list); newspa->spa_config_source = SPA_CONFIG_SRC_SPLIT; newspa->spa_is_splitting = B_TRUE; /* create the new pool from the disks of the original pool */ error = spa_load(newspa, SPA_LOAD_IMPORT, SPA_IMPORT_ASSEMBLE); if (error) goto out; /* if that worked, generate a real config for the new pool */ if (newspa->spa_root_vdev != NULL) { newspa->spa_config_splitting = fnvlist_alloc(); fnvlist_add_uint64(newspa->spa_config_splitting, ZPOOL_CONFIG_SPLIT_GUID, spa_guid(spa)); spa_config_set(newspa, spa_config_generate(newspa, NULL, -1ULL, B_TRUE)); } /* set the props */ if (props != NULL) { spa_configfile_set(newspa, props, B_FALSE); error = spa_prop_set(newspa, props); if (error) goto out; } /* flush everything */ txg = spa_vdev_config_enter(newspa); vdev_config_dirty(newspa->spa_root_vdev); (void) spa_vdev_config_exit(newspa, NULL, txg, 0, FTAG); if (zio_injection_enabled) zio_handle_panic_injection(spa, FTAG, 2); spa_async_resume(newspa); /* finally, update the original pool's config */ txg = spa_vdev_config_enter(spa); tx = dmu_tx_create_dd(spa_get_dsl(spa)->dp_mos_dir); error = dmu_tx_assign(tx, TXG_WAIT); if (error != 0) dmu_tx_abort(tx); for (c = 0; c < children; c++) { if (vml[c] != NULL && vml[c]->vdev_ops != &vdev_indirect_ops) { vdev_t *tvd = vml[c]->vdev_top; /* * Need to be sure the detachable VDEV is not * on any *other* txg's DTL list to prevent it * from being accessed after it's freed. */ for (int t = 0; t < TXG_SIZE; t++) { (void) txg_list_remove_this( &tvd->vdev_dtl_list, vml[c], t); } vdev_split(vml[c]); if (error == 0) spa_history_log_internal(spa, "detach", tx, "vdev=%s", vml[c]->vdev_path); vdev_free(vml[c]); } } spa->spa_avz_action = AVZ_ACTION_REBUILD; vdev_config_dirty(spa->spa_root_vdev); spa->spa_config_splitting = NULL; nvlist_free(nvl); if (error == 0) dmu_tx_commit(tx); (void) spa_vdev_exit(spa, NULL, txg, 0); if (zio_injection_enabled) zio_handle_panic_injection(spa, FTAG, 3); /* split is complete; log a history record */ spa_history_log_internal(newspa, "split", NULL, "from pool %s", spa_name(spa)); newspa->spa_is_splitting = B_FALSE; kmem_free(vml, children * sizeof (vdev_t *)); /* if we're not going to mount the filesystems in userland, export */ if (exp) error = spa_export_common(newname, POOL_STATE_EXPORTED, NULL, B_FALSE, B_FALSE); return (error); out: spa_unload(newspa); spa_deactivate(newspa); spa_remove(newspa); txg = spa_vdev_config_enter(spa); /* re-online all offlined disks */ for (c = 0; c < children; c++) { if (vml[c] != NULL) vml[c]->vdev_offline = B_FALSE; } /* restart initializing or trimming disks as necessary */ spa_async_request(spa, SPA_ASYNC_INITIALIZE_RESTART); spa_async_request(spa, SPA_ASYNC_TRIM_RESTART); spa_async_request(spa, SPA_ASYNC_AUTOTRIM_RESTART); vdev_reopen(spa->spa_root_vdev); nvlist_free(spa->spa_config_splitting); spa->spa_config_splitting = NULL; (void) spa_vdev_exit(spa, NULL, txg, error); kmem_free(vml, children * sizeof (vdev_t *)); return (error); } /* * Find any device that's done replacing, or a vdev marked 'unspare' that's * currently spared, so we can detach it. */ static vdev_t * spa_vdev_resilver_done_hunt(vdev_t *vd) { vdev_t *newvd, *oldvd; for (int c = 0; c < vd->vdev_children; c++) { oldvd = spa_vdev_resilver_done_hunt(vd->vdev_child[c]); if (oldvd != NULL) return (oldvd); } /* * Check for a completed replacement. We always consider the first * vdev in the list to be the oldest vdev, and the last one to be * the newest (see spa_vdev_attach() for how that works). In * the case where the newest vdev is faulted, we will not automatically * remove it after a resilver completes. This is OK as it will require * user intervention to determine which disk the admin wishes to keep. */ if (vd->vdev_ops == &vdev_replacing_ops) { ASSERT(vd->vdev_children > 1); newvd = vd->vdev_child[vd->vdev_children - 1]; oldvd = vd->vdev_child[0]; if (vdev_dtl_empty(newvd, DTL_MISSING) && vdev_dtl_empty(newvd, DTL_OUTAGE) && !vdev_dtl_required(oldvd)) return (oldvd); } /* * Check for a completed resilver with the 'unspare' flag set. * Also potentially update faulted state. */ if (vd->vdev_ops == &vdev_spare_ops) { vdev_t *first = vd->vdev_child[0]; vdev_t *last = vd->vdev_child[vd->vdev_children - 1]; if (last->vdev_unspare) { oldvd = first; newvd = last; } else if (first->vdev_unspare) { oldvd = last; newvd = first; } else { oldvd = NULL; } if (oldvd != NULL && vdev_dtl_empty(newvd, DTL_MISSING) && vdev_dtl_empty(newvd, DTL_OUTAGE) && !vdev_dtl_required(oldvd)) return (oldvd); vdev_propagate_state(vd); /* * If there are more than two spares attached to a disk, * and those spares are not required, then we want to * attempt to free them up now so that they can be used * by other pools. Once we're back down to a single * disk+spare, we stop removing them. */ if (vd->vdev_children > 2) { newvd = vd->vdev_child[1]; if (newvd->vdev_isspare && last->vdev_isspare && vdev_dtl_empty(last, DTL_MISSING) && vdev_dtl_empty(last, DTL_OUTAGE) && !vdev_dtl_required(newvd)) return (newvd); } } return (NULL); } static void spa_vdev_resilver_done(spa_t *spa) { vdev_t *vd, *pvd, *ppvd; uint64_t guid, sguid, pguid, ppguid; spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER); while ((vd = spa_vdev_resilver_done_hunt(spa->spa_root_vdev)) != NULL) { pvd = vd->vdev_parent; ppvd = pvd->vdev_parent; guid = vd->vdev_guid; pguid = pvd->vdev_guid; ppguid = ppvd->vdev_guid; sguid = 0; /* * If we have just finished replacing a hot spared device, then * we need to detach the parent's first child (the original hot * spare) as well. */ if (ppvd->vdev_ops == &vdev_spare_ops && pvd->vdev_id == 0 && ppvd->vdev_children == 2) { ASSERT(pvd->vdev_ops == &vdev_replacing_ops); sguid = ppvd->vdev_child[1]->vdev_guid; } ASSERT(vd->vdev_resilver_txg == 0 || !vdev_dtl_required(vd)); spa_config_exit(spa, SCL_ALL, FTAG); if (spa_vdev_detach(spa, guid, pguid, B_TRUE) != 0) return; if (sguid && spa_vdev_detach(spa, sguid, ppguid, B_TRUE) != 0) return; spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER); } spa_config_exit(spa, SCL_ALL, FTAG); /* * If a detach was not performed above replace waiters will not have * been notified. In which case we must do so now. */ spa_notify_waiters(spa); } /* * Update the stored path or FRU for this vdev. */ static int spa_vdev_set_common(spa_t *spa, uint64_t guid, const char *value, boolean_t ispath) { vdev_t *vd; boolean_t sync = B_FALSE; ASSERT(spa_writeable(spa)); spa_vdev_state_enter(spa, SCL_ALL); if ((vd = spa_lookup_by_guid(spa, guid, B_TRUE)) == NULL) return (spa_vdev_state_exit(spa, NULL, ENOENT)); if (!vd->vdev_ops->vdev_op_leaf) return (spa_vdev_state_exit(spa, NULL, ENOTSUP)); if (ispath) { if (strcmp(value, vd->vdev_path) != 0) { spa_strfree(vd->vdev_path); vd->vdev_path = spa_strdup(value); sync = B_TRUE; } } else { if (vd->vdev_fru == NULL) { vd->vdev_fru = spa_strdup(value); sync = B_TRUE; } else if (strcmp(value, vd->vdev_fru) != 0) { spa_strfree(vd->vdev_fru); vd->vdev_fru = spa_strdup(value); sync = B_TRUE; } } return (spa_vdev_state_exit(spa, sync ? vd : NULL, 0)); } int spa_vdev_setpath(spa_t *spa, uint64_t guid, const char *newpath) { return (spa_vdev_set_common(spa, guid, newpath, B_TRUE)); } int spa_vdev_setfru(spa_t *spa, uint64_t guid, const char *newfru) { return (spa_vdev_set_common(spa, guid, newfru, B_FALSE)); } /* * ========================================================================== * SPA Scanning * ========================================================================== */ int spa_scrub_pause_resume(spa_t *spa, pool_scrub_cmd_t cmd) { ASSERT(spa_config_held(spa, SCL_ALL, RW_WRITER) == 0); if (dsl_scan_resilvering(spa->spa_dsl_pool)) return (SET_ERROR(EBUSY)); return (dsl_scrub_set_pause_resume(spa->spa_dsl_pool, cmd)); } int spa_scan_stop(spa_t *spa) { ASSERT(spa_config_held(spa, SCL_ALL, RW_WRITER) == 0); if (dsl_scan_resilvering(spa->spa_dsl_pool)) return (SET_ERROR(EBUSY)); return (dsl_scan_cancel(spa->spa_dsl_pool)); } int spa_scan(spa_t *spa, pool_scan_func_t func) { ASSERT(spa_config_held(spa, SCL_ALL, RW_WRITER) == 0); if (func >= POOL_SCAN_FUNCS || func == POOL_SCAN_NONE) return (SET_ERROR(ENOTSUP)); if (func == POOL_SCAN_RESILVER && !spa_feature_is_enabled(spa, SPA_FEATURE_RESILVER_DEFER)) return (SET_ERROR(ENOTSUP)); /* * If a resilver was requested, but there is no DTL on a * writeable leaf device, we have nothing to do. */ if (func == POOL_SCAN_RESILVER && !vdev_resilver_needed(spa->spa_root_vdev, NULL, NULL)) { spa_async_request(spa, SPA_ASYNC_RESILVER_DONE); return (0); } return (dsl_scan(spa->spa_dsl_pool, func)); } /* * ========================================================================== * SPA async task processing * ========================================================================== */ static void spa_async_remove(spa_t *spa, vdev_t *vd) { if (vd->vdev_remove_wanted) { vd->vdev_remove_wanted = B_FALSE; vd->vdev_delayed_close = B_FALSE; vdev_set_state(vd, B_FALSE, VDEV_STATE_REMOVED, VDEV_AUX_NONE); /* * We want to clear the stats, but we don't want to do a full * vdev_clear() as that will cause us to throw away * degraded/faulted state as well as attempt to reopen the * device, all of which is a waste. */ vd->vdev_stat.vs_read_errors = 0; vd->vdev_stat.vs_write_errors = 0; vd->vdev_stat.vs_checksum_errors = 0; vdev_state_dirty(vd->vdev_top); /* Tell userspace that the vdev is gone. */ zfs_post_remove(spa, vd); } for (int c = 0; c < vd->vdev_children; c++) spa_async_remove(spa, vd->vdev_child[c]); } static void spa_async_probe(spa_t *spa, vdev_t *vd) { if (vd->vdev_probe_wanted) { vd->vdev_probe_wanted = B_FALSE; vdev_reopen(vd); /* vdev_open() does the actual probe */ } for (int c = 0; c < vd->vdev_children; c++) spa_async_probe(spa, vd->vdev_child[c]); } static void spa_async_autoexpand(spa_t *spa, vdev_t *vd) { if (!spa->spa_autoexpand) return; for (int c = 0; c < vd->vdev_children; c++) { vdev_t *cvd = vd->vdev_child[c]; spa_async_autoexpand(spa, cvd); } if (!vd->vdev_ops->vdev_op_leaf || vd->vdev_physpath == NULL) return; spa_event_notify(vd->vdev_spa, vd, NULL, ESC_ZFS_VDEV_AUTOEXPAND); } static __attribute__((noreturn)) void spa_async_thread(void *arg) { spa_t *spa = (spa_t *)arg; dsl_pool_t *dp = spa->spa_dsl_pool; int tasks; ASSERT(spa->spa_sync_on); mutex_enter(&spa->spa_async_lock); tasks = spa->spa_async_tasks; spa->spa_async_tasks = 0; mutex_exit(&spa->spa_async_lock); /* * See if the config needs to be updated. */ if (tasks & SPA_ASYNC_CONFIG_UPDATE) { uint64_t old_space, new_space; mutex_enter(&spa_namespace_lock); old_space = metaslab_class_get_space(spa_normal_class(spa)); old_space += metaslab_class_get_space(spa_special_class(spa)); old_space += metaslab_class_get_space(spa_dedup_class(spa)); old_space += metaslab_class_get_space( spa_embedded_log_class(spa)); spa_config_update(spa, SPA_CONFIG_UPDATE_POOL); new_space = metaslab_class_get_space(spa_normal_class(spa)); new_space += metaslab_class_get_space(spa_special_class(spa)); new_space += metaslab_class_get_space(spa_dedup_class(spa)); new_space += metaslab_class_get_space( spa_embedded_log_class(spa)); mutex_exit(&spa_namespace_lock); /* * If the pool grew as a result of the config update, * then log an internal history event. */ if (new_space != old_space) { spa_history_log_internal(spa, "vdev online", NULL, "pool '%s' size: %llu(+%llu)", spa_name(spa), (u_longlong_t)new_space, (u_longlong_t)(new_space - old_space)); } } /* * See if any devices need to be marked REMOVED. */ if (tasks & SPA_ASYNC_REMOVE) { spa_vdev_state_enter(spa, SCL_NONE); spa_async_remove(spa, spa->spa_root_vdev); for (int i = 0; i < spa->spa_l2cache.sav_count; i++) spa_async_remove(spa, spa->spa_l2cache.sav_vdevs[i]); for (int i = 0; i < spa->spa_spares.sav_count; i++) spa_async_remove(spa, spa->spa_spares.sav_vdevs[i]); (void) spa_vdev_state_exit(spa, NULL, 0); } if ((tasks & SPA_ASYNC_AUTOEXPAND) && !spa_suspended(spa)) { spa_config_enter(spa, SCL_CONFIG, FTAG, RW_READER); spa_async_autoexpand(spa, spa->spa_root_vdev); spa_config_exit(spa, SCL_CONFIG, FTAG); } /* * See if any devices need to be probed. */ if (tasks & SPA_ASYNC_PROBE) { spa_vdev_state_enter(spa, SCL_NONE); spa_async_probe(spa, spa->spa_root_vdev); (void) spa_vdev_state_exit(spa, NULL, 0); } /* * If any devices are done replacing, detach them. */ if (tasks & SPA_ASYNC_RESILVER_DONE || tasks & SPA_ASYNC_REBUILD_DONE) { spa_vdev_resilver_done(spa); } /* * Kick off a resilver. */ if (tasks & SPA_ASYNC_RESILVER && !vdev_rebuild_active(spa->spa_root_vdev) && (!dsl_scan_resilvering(dp) || !spa_feature_is_enabled(dp->dp_spa, SPA_FEATURE_RESILVER_DEFER))) dsl_scan_restart_resilver(dp, 0); if (tasks & SPA_ASYNC_INITIALIZE_RESTART) { mutex_enter(&spa_namespace_lock); spa_config_enter(spa, SCL_CONFIG, FTAG, RW_READER); vdev_initialize_restart(spa->spa_root_vdev); spa_config_exit(spa, SCL_CONFIG, FTAG); mutex_exit(&spa_namespace_lock); } if (tasks & SPA_ASYNC_TRIM_RESTART) { mutex_enter(&spa_namespace_lock); spa_config_enter(spa, SCL_CONFIG, FTAG, RW_READER); vdev_trim_restart(spa->spa_root_vdev); spa_config_exit(spa, SCL_CONFIG, FTAG); mutex_exit(&spa_namespace_lock); } if (tasks & SPA_ASYNC_AUTOTRIM_RESTART) { mutex_enter(&spa_namespace_lock); spa_config_enter(spa, SCL_CONFIG, FTAG, RW_READER); vdev_autotrim_restart(spa); spa_config_exit(spa, SCL_CONFIG, FTAG); mutex_exit(&spa_namespace_lock); } /* * Kick off L2 cache whole device TRIM. */ if (tasks & SPA_ASYNC_L2CACHE_TRIM) { mutex_enter(&spa_namespace_lock); spa_config_enter(spa, SCL_CONFIG, FTAG, RW_READER); vdev_trim_l2arc(spa); spa_config_exit(spa, SCL_CONFIG, FTAG); mutex_exit(&spa_namespace_lock); } /* * Kick off L2 cache rebuilding. */ if (tasks & SPA_ASYNC_L2CACHE_REBUILD) { mutex_enter(&spa_namespace_lock); spa_config_enter(spa, SCL_L2ARC, FTAG, RW_READER); l2arc_spa_rebuild_start(spa); spa_config_exit(spa, SCL_L2ARC, FTAG); mutex_exit(&spa_namespace_lock); } /* * Let the world know that we're done. */ mutex_enter(&spa->spa_async_lock); spa->spa_async_thread = NULL; cv_broadcast(&spa->spa_async_cv); mutex_exit(&spa->spa_async_lock); thread_exit(); } void spa_async_suspend(spa_t *spa) { mutex_enter(&spa->spa_async_lock); spa->spa_async_suspended++; while (spa->spa_async_thread != NULL) cv_wait(&spa->spa_async_cv, &spa->spa_async_lock); mutex_exit(&spa->spa_async_lock); spa_vdev_remove_suspend(spa); zthr_t *condense_thread = spa->spa_condense_zthr; if (condense_thread != NULL) zthr_cancel(condense_thread); zthr_t *discard_thread = spa->spa_checkpoint_discard_zthr; if (discard_thread != NULL) zthr_cancel(discard_thread); zthr_t *ll_delete_thread = spa->spa_livelist_delete_zthr; if (ll_delete_thread != NULL) zthr_cancel(ll_delete_thread); zthr_t *ll_condense_thread = spa->spa_livelist_condense_zthr; if (ll_condense_thread != NULL) zthr_cancel(ll_condense_thread); } void spa_async_resume(spa_t *spa) { mutex_enter(&spa->spa_async_lock); ASSERT(spa->spa_async_suspended != 0); spa->spa_async_suspended--; mutex_exit(&spa->spa_async_lock); spa_restart_removal(spa); zthr_t *condense_thread = spa->spa_condense_zthr; if (condense_thread != NULL) zthr_resume(condense_thread); zthr_t *discard_thread = spa->spa_checkpoint_discard_zthr; if (discard_thread != NULL) zthr_resume(discard_thread); zthr_t *ll_delete_thread = spa->spa_livelist_delete_zthr; if (ll_delete_thread != NULL) zthr_resume(ll_delete_thread); zthr_t *ll_condense_thread = spa->spa_livelist_condense_zthr; if (ll_condense_thread != NULL) zthr_resume(ll_condense_thread); } static boolean_t spa_async_tasks_pending(spa_t *spa) { uint_t non_config_tasks; uint_t config_task; boolean_t config_task_suspended; non_config_tasks = spa->spa_async_tasks & ~SPA_ASYNC_CONFIG_UPDATE; config_task = spa->spa_async_tasks & SPA_ASYNC_CONFIG_UPDATE; if (spa->spa_ccw_fail_time == 0) { config_task_suspended = B_FALSE; } else { config_task_suspended = (gethrtime() - spa->spa_ccw_fail_time) < ((hrtime_t)zfs_ccw_retry_interval * NANOSEC); } return (non_config_tasks || (config_task && !config_task_suspended)); } static void spa_async_dispatch(spa_t *spa) { mutex_enter(&spa->spa_async_lock); if (spa_async_tasks_pending(spa) && !spa->spa_async_suspended && spa->spa_async_thread == NULL) spa->spa_async_thread = thread_create(NULL, 0, spa_async_thread, spa, 0, &p0, TS_RUN, maxclsyspri); mutex_exit(&spa->spa_async_lock); } void spa_async_request(spa_t *spa, int task) { zfs_dbgmsg("spa=%s async request task=%u", spa->spa_name, task); mutex_enter(&spa->spa_async_lock); spa->spa_async_tasks |= task; mutex_exit(&spa->spa_async_lock); } int spa_async_tasks(spa_t *spa) { return (spa->spa_async_tasks); } /* * ========================================================================== * SPA syncing routines * ========================================================================== */ static int bpobj_enqueue_cb(void *arg, const blkptr_t *bp, boolean_t bp_freed, dmu_tx_t *tx) { bpobj_t *bpo = arg; bpobj_enqueue(bpo, bp, bp_freed, tx); return (0); } int bpobj_enqueue_alloc_cb(void *arg, const blkptr_t *bp, dmu_tx_t *tx) { return (bpobj_enqueue_cb(arg, bp, B_FALSE, tx)); } int bpobj_enqueue_free_cb(void *arg, const blkptr_t *bp, dmu_tx_t *tx) { return (bpobj_enqueue_cb(arg, bp, B_TRUE, tx)); } static int spa_free_sync_cb(void *arg, const blkptr_t *bp, dmu_tx_t *tx) { zio_t *pio = arg; zio_nowait(zio_free_sync(pio, pio->io_spa, dmu_tx_get_txg(tx), bp, pio->io_flags)); return (0); } static int bpobj_spa_free_sync_cb(void *arg, const blkptr_t *bp, boolean_t bp_freed, dmu_tx_t *tx) { ASSERT(!bp_freed); return (spa_free_sync_cb(arg, bp, tx)); } /* * Note: this simple function is not inlined to make it easier to dtrace the * amount of time spent syncing frees. */ static void spa_sync_frees(spa_t *spa, bplist_t *bpl, dmu_tx_t *tx) { zio_t *zio = zio_root(spa, NULL, NULL, 0); bplist_iterate(bpl, spa_free_sync_cb, zio, tx); VERIFY(zio_wait(zio) == 0); } /* * Note: this simple function is not inlined to make it easier to dtrace the * amount of time spent syncing deferred frees. */ static void spa_sync_deferred_frees(spa_t *spa, dmu_tx_t *tx) { if (spa_sync_pass(spa) != 1) return; /* * Note: * If the log space map feature is active, we stop deferring * frees to the next TXG and therefore running this function * would be considered a no-op as spa_deferred_bpobj should * not have any entries. * * That said we run this function anyway (instead of returning * immediately) for the edge-case scenario where we just * activated the log space map feature in this TXG but we have * deferred frees from the previous TXG. */ zio_t *zio = zio_root(spa, NULL, NULL, 0); VERIFY3U(bpobj_iterate(&spa->spa_deferred_bpobj, bpobj_spa_free_sync_cb, zio, tx), ==, 0); VERIFY0(zio_wait(zio)); } static void spa_sync_nvlist(spa_t *spa, uint64_t obj, nvlist_t *nv, dmu_tx_t *tx) { char *packed = NULL; size_t bufsize; size_t nvsize = 0; dmu_buf_t *db; VERIFY(nvlist_size(nv, &nvsize, NV_ENCODE_XDR) == 0); /* * Write full (SPA_CONFIG_BLOCKSIZE) blocks of configuration * information. This avoids the dmu_buf_will_dirty() path and * saves us a pre-read to get data we don't actually care about. */ bufsize = P2ROUNDUP((uint64_t)nvsize, SPA_CONFIG_BLOCKSIZE); packed = vmem_alloc(bufsize, KM_SLEEP); VERIFY(nvlist_pack(nv, &packed, &nvsize, NV_ENCODE_XDR, KM_SLEEP) == 0); memset(packed + nvsize, 0, bufsize - nvsize); dmu_write(spa->spa_meta_objset, obj, 0, bufsize, packed, tx); vmem_free(packed, bufsize); VERIFY(0 == dmu_bonus_hold(spa->spa_meta_objset, obj, FTAG, &db)); dmu_buf_will_dirty(db, tx); *(uint64_t *)db->db_data = nvsize; dmu_buf_rele(db, FTAG); } static void spa_sync_aux_dev(spa_t *spa, spa_aux_vdev_t *sav, dmu_tx_t *tx, const char *config, const char *entry) { nvlist_t *nvroot; nvlist_t **list; int i; if (!sav->sav_sync) return; /* * Update the MOS nvlist describing the list of available devices. * spa_validate_aux() will have already made sure this nvlist is * valid and the vdevs are labeled appropriately. */ if (sav->sav_object == 0) { sav->sav_object = dmu_object_alloc(spa->spa_meta_objset, DMU_OT_PACKED_NVLIST, 1 << 14, DMU_OT_PACKED_NVLIST_SIZE, sizeof (uint64_t), tx); VERIFY(zap_update(spa->spa_meta_objset, DMU_POOL_DIRECTORY_OBJECT, entry, sizeof (uint64_t), 1, &sav->sav_object, tx) == 0); } nvroot = fnvlist_alloc(); if (sav->sav_count == 0) { fnvlist_add_nvlist_array(nvroot, config, (const nvlist_t * const *)NULL, 0); } else { list = kmem_alloc(sav->sav_count*sizeof (void *), KM_SLEEP); for (i = 0; i < sav->sav_count; i++) list[i] = vdev_config_generate(spa, sav->sav_vdevs[i], B_FALSE, VDEV_CONFIG_L2CACHE); fnvlist_add_nvlist_array(nvroot, config, (const nvlist_t * const *)list, sav->sav_count); for (i = 0; i < sav->sav_count; i++) nvlist_free(list[i]); kmem_free(list, sav->sav_count * sizeof (void *)); } spa_sync_nvlist(spa, sav->sav_object, nvroot, tx); nvlist_free(nvroot); sav->sav_sync = B_FALSE; } /* * Rebuild spa's all-vdev ZAP from the vdev ZAPs indicated in each vdev_t. * The all-vdev ZAP must be empty. */ static void spa_avz_build(vdev_t *vd, uint64_t avz, dmu_tx_t *tx) { spa_t *spa = vd->vdev_spa; if (vd->vdev_top_zap != 0) { VERIFY0(zap_add_int(spa->spa_meta_objset, avz, vd->vdev_top_zap, tx)); } if (vd->vdev_leaf_zap != 0) { VERIFY0(zap_add_int(spa->spa_meta_objset, avz, vd->vdev_leaf_zap, tx)); } for (uint64_t i = 0; i < vd->vdev_children; i++) { spa_avz_build(vd->vdev_child[i], avz, tx); } } static void spa_sync_config_object(spa_t *spa, dmu_tx_t *tx) { nvlist_t *config; /* * If the pool is being imported from a pre-per-vdev-ZAP version of ZFS, * its config may not be dirty but we still need to build per-vdev ZAPs. * Similarly, if the pool is being assembled (e.g. after a split), we * need to rebuild the AVZ although the config may not be dirty. */ if (list_is_empty(&spa->spa_config_dirty_list) && spa->spa_avz_action == AVZ_ACTION_NONE) return; spa_config_enter(spa, SCL_STATE, FTAG, RW_READER); ASSERT(spa->spa_avz_action == AVZ_ACTION_NONE || spa->spa_avz_action == AVZ_ACTION_INITIALIZE || spa->spa_all_vdev_zaps != 0); if (spa->spa_avz_action == AVZ_ACTION_REBUILD) { /* Make and build the new AVZ */ uint64_t new_avz = zap_create(spa->spa_meta_objset, DMU_OTN_ZAP_METADATA, DMU_OT_NONE, 0, tx); spa_avz_build(spa->spa_root_vdev, new_avz, tx); /* Diff old AVZ with new one */ zap_cursor_t zc; zap_attribute_t za; for (zap_cursor_init(&zc, spa->spa_meta_objset, spa->spa_all_vdev_zaps); zap_cursor_retrieve(&zc, &za) == 0; zap_cursor_advance(&zc)) { uint64_t vdzap = za.za_first_integer; if (zap_lookup_int(spa->spa_meta_objset, new_avz, vdzap) == ENOENT) { /* * ZAP is listed in old AVZ but not in new one; * destroy it */ VERIFY0(zap_destroy(spa->spa_meta_objset, vdzap, tx)); } } zap_cursor_fini(&zc); /* Destroy the old AVZ */ VERIFY0(zap_destroy(spa->spa_meta_objset, spa->spa_all_vdev_zaps, tx)); /* Replace the old AVZ in the dir obj with the new one */ VERIFY0(zap_update(spa->spa_meta_objset, DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_VDEV_ZAP_MAP, sizeof (new_avz), 1, &new_avz, tx)); spa->spa_all_vdev_zaps = new_avz; } else if (spa->spa_avz_action == AVZ_ACTION_DESTROY) { zap_cursor_t zc; zap_attribute_t za; /* Walk through the AVZ and destroy all listed ZAPs */ for (zap_cursor_init(&zc, spa->spa_meta_objset, spa->spa_all_vdev_zaps); zap_cursor_retrieve(&zc, &za) == 0; zap_cursor_advance(&zc)) { uint64_t zap = za.za_first_integer; VERIFY0(zap_destroy(spa->spa_meta_objset, zap, tx)); } zap_cursor_fini(&zc); /* Destroy and unlink the AVZ itself */ VERIFY0(zap_destroy(spa->spa_meta_objset, spa->spa_all_vdev_zaps, tx)); VERIFY0(zap_remove(spa->spa_meta_objset, DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_VDEV_ZAP_MAP, tx)); spa->spa_all_vdev_zaps = 0; } if (spa->spa_all_vdev_zaps == 0) { spa->spa_all_vdev_zaps = zap_create_link(spa->spa_meta_objset, DMU_OTN_ZAP_METADATA, DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_VDEV_ZAP_MAP, tx); } spa->spa_avz_action = AVZ_ACTION_NONE; /* Create ZAPs for vdevs that don't have them. */ vdev_construct_zaps(spa->spa_root_vdev, tx); config = spa_config_generate(spa, spa->spa_root_vdev, dmu_tx_get_txg(tx), B_FALSE); /* * If we're upgrading the spa version then make sure that * the config object gets updated with the correct version. */ if (spa->spa_ubsync.ub_version < spa->spa_uberblock.ub_version) fnvlist_add_uint64(config, ZPOOL_CONFIG_VERSION, spa->spa_uberblock.ub_version); spa_config_exit(spa, SCL_STATE, FTAG); nvlist_free(spa->spa_config_syncing); spa->spa_config_syncing = config; spa_sync_nvlist(spa, spa->spa_config_object, config, tx); } static void spa_sync_version(void *arg, dmu_tx_t *tx) { uint64_t *versionp = arg; uint64_t version = *versionp; spa_t *spa = dmu_tx_pool(tx)->dp_spa; /* * Setting the version is special cased when first creating the pool. */ ASSERT(tx->tx_txg != TXG_INITIAL); ASSERT(SPA_VERSION_IS_SUPPORTED(version)); ASSERT(version >= spa_version(spa)); spa->spa_uberblock.ub_version = version; vdev_config_dirty(spa->spa_root_vdev); spa_history_log_internal(spa, "set", tx, "version=%lld", (longlong_t)version); } /* * Set zpool properties. */ static void spa_sync_props(void *arg, dmu_tx_t *tx) { nvlist_t *nvp = arg; spa_t *spa = dmu_tx_pool(tx)->dp_spa; objset_t *mos = spa->spa_meta_objset; nvpair_t *elem = NULL; mutex_enter(&spa->spa_props_lock); while ((elem = nvlist_next_nvpair(nvp, elem))) { uint64_t intval; char *strval, *fname; zpool_prop_t prop; const char *propname; zprop_type_t proptype; spa_feature_t fid; switch (prop = zpool_name_to_prop(nvpair_name(elem))) { case ZPOOL_PROP_INVAL: /* * We checked this earlier in spa_prop_validate(). */ ASSERT(zpool_prop_feature(nvpair_name(elem))); fname = strchr(nvpair_name(elem), '@') + 1; VERIFY0(zfeature_lookup_name(fname, &fid)); spa_feature_enable(spa, fid, tx); spa_history_log_internal(spa, "set", tx, "%s=enabled", nvpair_name(elem)); break; case ZPOOL_PROP_VERSION: intval = fnvpair_value_uint64(elem); /* * The version is synced separately before other * properties and should be correct by now. */ ASSERT3U(spa_version(spa), >=, intval); break; case ZPOOL_PROP_ALTROOT: /* * 'altroot' is a non-persistent property. It should * have been set temporarily at creation or import time. */ ASSERT(spa->spa_root != NULL); break; case ZPOOL_PROP_READONLY: case ZPOOL_PROP_CACHEFILE: /* * 'readonly' and 'cachefile' are also non-persistent * properties. */ break; case ZPOOL_PROP_COMMENT: strval = fnvpair_value_string(elem); if (spa->spa_comment != NULL) spa_strfree(spa->spa_comment); spa->spa_comment = spa_strdup(strval); /* * We need to dirty the configuration on all the vdevs * so that their labels get updated. We also need to * update the cache file to keep it in sync with the * MOS version. It's unnecessary to do this for pool * creation since the vdev's configuration has already * been dirtied. */ if (tx->tx_txg != TXG_INITIAL) { vdev_config_dirty(spa->spa_root_vdev); spa_async_request(spa, SPA_ASYNC_CONFIG_UPDATE); } spa_history_log_internal(spa, "set", tx, "%s=%s", nvpair_name(elem), strval); break; case ZPOOL_PROP_COMPATIBILITY: strval = fnvpair_value_string(elem); if (spa->spa_compatibility != NULL) spa_strfree(spa->spa_compatibility); spa->spa_compatibility = spa_strdup(strval); /* * Dirty the configuration on vdevs as above. */ if (tx->tx_txg != TXG_INITIAL) { vdev_config_dirty(spa->spa_root_vdev); spa_async_request(spa, SPA_ASYNC_CONFIG_UPDATE); } spa_history_log_internal(spa, "set", tx, "%s=%s", nvpair_name(elem), strval); break; default: /* * Set pool property values in the poolprops mos object. */ if (spa->spa_pool_props_object == 0) { spa->spa_pool_props_object = zap_create_link(mos, DMU_OT_POOL_PROPS, DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_PROPS, tx); } /* normalize the property name */ propname = zpool_prop_to_name(prop); proptype = zpool_prop_get_type(prop); if (nvpair_type(elem) == DATA_TYPE_STRING) { ASSERT(proptype == PROP_TYPE_STRING); strval = fnvpair_value_string(elem); VERIFY0(zap_update(mos, spa->spa_pool_props_object, propname, 1, strlen(strval) + 1, strval, tx)); spa_history_log_internal(spa, "set", tx, "%s=%s", nvpair_name(elem), strval); } else if (nvpair_type(elem) == DATA_TYPE_UINT64) { intval = fnvpair_value_uint64(elem); if (proptype == PROP_TYPE_INDEX) { const char *unused; VERIFY0(zpool_prop_index_to_string( prop, intval, &unused)); } VERIFY0(zap_update(mos, spa->spa_pool_props_object, propname, 8, 1, &intval, tx)); spa_history_log_internal(spa, "set", tx, "%s=%lld", nvpair_name(elem), (longlong_t)intval); } else { ASSERT(0); /* not allowed */ } switch (prop) { case ZPOOL_PROP_DELEGATION: spa->spa_delegation = intval; break; case ZPOOL_PROP_BOOTFS: spa->spa_bootfs = intval; break; case ZPOOL_PROP_FAILUREMODE: spa->spa_failmode = intval; break; case ZPOOL_PROP_AUTOTRIM: spa->spa_autotrim = intval; spa_async_request(spa, SPA_ASYNC_AUTOTRIM_RESTART); break; case ZPOOL_PROP_AUTOEXPAND: spa->spa_autoexpand = intval; if (tx->tx_txg != TXG_INITIAL) spa_async_request(spa, SPA_ASYNC_AUTOEXPAND); break; case ZPOOL_PROP_MULTIHOST: spa->spa_multihost = intval; break; default: break; } } } mutex_exit(&spa->spa_props_lock); } /* * Perform one-time upgrade on-disk changes. spa_version() does not * reflect the new version this txg, so there must be no changes this * txg to anything that the upgrade code depends on after it executes. * Therefore this must be called after dsl_pool_sync() does the sync * tasks. */ static void spa_sync_upgrades(spa_t *spa, dmu_tx_t *tx) { if (spa_sync_pass(spa) != 1) return; dsl_pool_t *dp = spa->spa_dsl_pool; rrw_enter(&dp->dp_config_rwlock, RW_WRITER, FTAG); if (spa->spa_ubsync.ub_version < SPA_VERSION_ORIGIN && spa->spa_uberblock.ub_version >= SPA_VERSION_ORIGIN) { dsl_pool_create_origin(dp, tx); /* Keeping the origin open increases spa_minref */ spa->spa_minref += 3; } if (spa->spa_ubsync.ub_version < SPA_VERSION_NEXT_CLONES && spa->spa_uberblock.ub_version >= SPA_VERSION_NEXT_CLONES) { dsl_pool_upgrade_clones(dp, tx); } if (spa->spa_ubsync.ub_version < SPA_VERSION_DIR_CLONES && spa->spa_uberblock.ub_version >= SPA_VERSION_DIR_CLONES) { dsl_pool_upgrade_dir_clones(dp, tx); /* Keeping the freedir open increases spa_minref */ spa->spa_minref += 3; } if (spa->spa_ubsync.ub_version < SPA_VERSION_FEATURES && spa->spa_uberblock.ub_version >= SPA_VERSION_FEATURES) { spa_feature_create_zap_objects(spa, tx); } /* * LZ4_COMPRESS feature's behaviour was changed to activate_on_enable * when possibility to use lz4 compression for metadata was added * Old pools that have this feature enabled must be upgraded to have * this feature active */ if (spa->spa_uberblock.ub_version >= SPA_VERSION_FEATURES) { boolean_t lz4_en = spa_feature_is_enabled(spa, SPA_FEATURE_LZ4_COMPRESS); boolean_t lz4_ac = spa_feature_is_active(spa, SPA_FEATURE_LZ4_COMPRESS); if (lz4_en && !lz4_ac) spa_feature_incr(spa, SPA_FEATURE_LZ4_COMPRESS, tx); } /* * If we haven't written the salt, do so now. Note that the * feature may not be activated yet, but that's fine since * the presence of this ZAP entry is backwards compatible. */ if (zap_contains(spa->spa_meta_objset, DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_CHECKSUM_SALT) == ENOENT) { VERIFY0(zap_add(spa->spa_meta_objset, DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_CHECKSUM_SALT, 1, sizeof (spa->spa_cksum_salt.zcs_bytes), spa->spa_cksum_salt.zcs_bytes, tx)); } rrw_exit(&dp->dp_config_rwlock, FTAG); } static void vdev_indirect_state_sync_verify(vdev_t *vd) { vdev_indirect_mapping_t *vim __maybe_unused = vd->vdev_indirect_mapping; vdev_indirect_births_t *vib __maybe_unused = vd->vdev_indirect_births; if (vd->vdev_ops == &vdev_indirect_ops) { ASSERT(vim != NULL); ASSERT(vib != NULL); } uint64_t obsolete_sm_object = 0; ASSERT0(vdev_obsolete_sm_object(vd, &obsolete_sm_object)); if (obsolete_sm_object != 0) { ASSERT(vd->vdev_obsolete_sm != NULL); ASSERT(vd->vdev_removing || vd->vdev_ops == &vdev_indirect_ops); ASSERT(vdev_indirect_mapping_num_entries(vim) > 0); ASSERT(vdev_indirect_mapping_bytes_mapped(vim) > 0); ASSERT3U(obsolete_sm_object, ==, space_map_object(vd->vdev_obsolete_sm)); ASSERT3U(vdev_indirect_mapping_bytes_mapped(vim), >=, space_map_allocated(vd->vdev_obsolete_sm)); } ASSERT(vd->vdev_obsolete_segments != NULL); /* * Since frees / remaps to an indirect vdev can only * happen in syncing context, the obsolete segments * tree must be empty when we start syncing. */ ASSERT0(range_tree_space(vd->vdev_obsolete_segments)); } /* * Set the top-level vdev's max queue depth. Evaluate each top-level's * async write queue depth in case it changed. The max queue depth will * not change in the middle of syncing out this txg. */ static void spa_sync_adjust_vdev_max_queue_depth(spa_t *spa) { ASSERT(spa_writeable(spa)); vdev_t *rvd = spa->spa_root_vdev; uint32_t max_queue_depth = zfs_vdev_async_write_max_active * zfs_vdev_queue_depth_pct / 100; metaslab_class_t *normal = spa_normal_class(spa); metaslab_class_t *special = spa_special_class(spa); metaslab_class_t *dedup = spa_dedup_class(spa); uint64_t slots_per_allocator = 0; for (int c = 0; c < rvd->vdev_children; c++) { vdev_t *tvd = rvd->vdev_child[c]; metaslab_group_t *mg = tvd->vdev_mg; if (mg == NULL || !metaslab_group_initialized(mg)) continue; metaslab_class_t *mc = mg->mg_class; if (mc != normal && mc != special && mc != dedup) continue; /* * It is safe to do a lock-free check here because only async * allocations look at mg_max_alloc_queue_depth, and async * allocations all happen from spa_sync(). */ for (int i = 0; i < mg->mg_allocators; i++) { ASSERT0(zfs_refcount_count( &(mg->mg_allocator[i].mga_alloc_queue_depth))); } mg->mg_max_alloc_queue_depth = max_queue_depth; for (int i = 0; i < mg->mg_allocators; i++) { mg->mg_allocator[i].mga_cur_max_alloc_queue_depth = zfs_vdev_def_queue_depth; } slots_per_allocator += zfs_vdev_def_queue_depth; } for (int i = 0; i < spa->spa_alloc_count; i++) { ASSERT0(zfs_refcount_count(&normal->mc_allocator[i]. mca_alloc_slots)); ASSERT0(zfs_refcount_count(&special->mc_allocator[i]. mca_alloc_slots)); ASSERT0(zfs_refcount_count(&dedup->mc_allocator[i]. mca_alloc_slots)); normal->mc_allocator[i].mca_alloc_max_slots = slots_per_allocator; special->mc_allocator[i].mca_alloc_max_slots = slots_per_allocator; dedup->mc_allocator[i].mca_alloc_max_slots = slots_per_allocator; } normal->mc_alloc_throttle_enabled = zio_dva_throttle_enabled; special->mc_alloc_throttle_enabled = zio_dva_throttle_enabled; dedup->mc_alloc_throttle_enabled = zio_dva_throttle_enabled; } static void spa_sync_condense_indirect(spa_t *spa, dmu_tx_t *tx) { ASSERT(spa_writeable(spa)); vdev_t *rvd = spa->spa_root_vdev; for (int c = 0; c < rvd->vdev_children; c++) { vdev_t *vd = rvd->vdev_child[c]; vdev_indirect_state_sync_verify(vd); if (vdev_indirect_should_condense(vd)) { spa_condense_indirect_start_sync(vd, tx); break; } } } static void spa_sync_iterate_to_convergence(spa_t *spa, dmu_tx_t *tx) { objset_t *mos = spa->spa_meta_objset; dsl_pool_t *dp = spa->spa_dsl_pool; uint64_t txg = tx->tx_txg; bplist_t *free_bpl = &spa->spa_free_bplist[txg & TXG_MASK]; do { int pass = ++spa->spa_sync_pass; spa_sync_config_object(spa, tx); spa_sync_aux_dev(spa, &spa->spa_spares, tx, ZPOOL_CONFIG_SPARES, DMU_POOL_SPARES); spa_sync_aux_dev(spa, &spa->spa_l2cache, tx, ZPOOL_CONFIG_L2CACHE, DMU_POOL_L2CACHE); spa_errlog_sync(spa, txg); dsl_pool_sync(dp, txg); if (pass < zfs_sync_pass_deferred_free || spa_feature_is_active(spa, SPA_FEATURE_LOG_SPACEMAP)) { /* * If the log space map feature is active we don't * care about deferred frees and the deferred bpobj * as the log space map should effectively have the * same results (i.e. appending only to one object). */ spa_sync_frees(spa, free_bpl, tx); } else { /* * We can not defer frees in pass 1, because * we sync the deferred frees later in pass 1. */ ASSERT3U(pass, >, 1); bplist_iterate(free_bpl, bpobj_enqueue_alloc_cb, &spa->spa_deferred_bpobj, tx); } ddt_sync(spa, txg); dsl_scan_sync(dp, tx); svr_sync(spa, tx); spa_sync_upgrades(spa, tx); spa_flush_metaslabs(spa, tx); vdev_t *vd = NULL; while ((vd = txg_list_remove(&spa->spa_vdev_txg_list, txg)) != NULL) vdev_sync(vd, txg); /* * Note: We need to check if the MOS is dirty because we could * have marked the MOS dirty without updating the uberblock * (e.g. if we have sync tasks but no dirty user data). We need * to check the uberblock's rootbp because it is updated if we * have synced out dirty data (though in this case the MOS will * most likely also be dirty due to second order effects, we * don't want to rely on that here). */ if (pass == 1 && spa->spa_uberblock.ub_rootbp.blk_birth < txg && !dmu_objset_is_dirty(mos, txg)) { /* * Nothing changed on the first pass, therefore this * TXG is a no-op. Avoid syncing deferred frees, so * that we can keep this TXG as a no-op. */ ASSERT(txg_list_empty(&dp->dp_dirty_datasets, txg)); ASSERT(txg_list_empty(&dp->dp_dirty_dirs, txg)); ASSERT(txg_list_empty(&dp->dp_sync_tasks, txg)); ASSERT(txg_list_empty(&dp->dp_early_sync_tasks, txg)); break; } spa_sync_deferred_frees(spa, tx); } while (dmu_objset_is_dirty(mos, txg)); } /* * Rewrite the vdev configuration (which includes the uberblock) to * commit the transaction group. * * If there are no dirty vdevs, we sync the uberblock to a few random * top-level vdevs that are known to be visible in the config cache * (see spa_vdev_add() for a complete description). If there *are* dirty * vdevs, sync the uberblock to all vdevs. */ static void spa_sync_rewrite_vdev_config(spa_t *spa, dmu_tx_t *tx) { vdev_t *rvd = spa->spa_root_vdev; uint64_t txg = tx->tx_txg; for (;;) { int error = 0; /* * We hold SCL_STATE to prevent vdev open/close/etc. * while we're attempting to write the vdev labels. */ spa_config_enter(spa, SCL_STATE, FTAG, RW_READER); if (list_is_empty(&spa->spa_config_dirty_list)) { vdev_t *svd[SPA_SYNC_MIN_VDEVS] = { NULL }; int svdcount = 0; int children = rvd->vdev_children; int c0 = random_in_range(children); for (int c = 0; c < children; c++) { vdev_t *vd = rvd->vdev_child[(c0 + c) % children]; /* Stop when revisiting the first vdev */ if (c > 0 && svd[0] == vd) break; if (vd->vdev_ms_array == 0 || vd->vdev_islog || !vdev_is_concrete(vd)) continue; svd[svdcount++] = vd; if (svdcount == SPA_SYNC_MIN_VDEVS) break; } error = vdev_config_sync(svd, svdcount, txg); } else { error = vdev_config_sync(rvd->vdev_child, rvd->vdev_children, txg); } if (error == 0) spa->spa_last_synced_guid = rvd->vdev_guid; spa_config_exit(spa, SCL_STATE, FTAG); if (error == 0) break; zio_suspend(spa, NULL, ZIO_SUSPEND_IOERR); zio_resume_wait(spa); } } /* * Sync the specified transaction group. New blocks may be dirtied as * part of the process, so we iterate until it converges. */ void spa_sync(spa_t *spa, uint64_t txg) { vdev_t *vd = NULL; VERIFY(spa_writeable(spa)); /* * Wait for i/os issued in open context that need to complete * before this txg syncs. */ (void) zio_wait(spa->spa_txg_zio[txg & TXG_MASK]); spa->spa_txg_zio[txg & TXG_MASK] = zio_root(spa, NULL, NULL, ZIO_FLAG_CANFAIL); /* * Lock out configuration changes. */ spa_config_enter(spa, SCL_CONFIG, FTAG, RW_READER); spa->spa_syncing_txg = txg; spa->spa_sync_pass = 0; for (int i = 0; i < spa->spa_alloc_count; i++) { mutex_enter(&spa->spa_allocs[i].spaa_lock); VERIFY0(avl_numnodes(&spa->spa_allocs[i].spaa_tree)); mutex_exit(&spa->spa_allocs[i].spaa_lock); } /* * If there are any pending vdev state changes, convert them * into config changes that go out with this transaction group. */ spa_config_enter(spa, SCL_STATE, FTAG, RW_READER); while (list_head(&spa->spa_state_dirty_list) != NULL) { /* * We need the write lock here because, for aux vdevs, * calling vdev_config_dirty() modifies sav_config. * This is ugly and will become unnecessary when we * eliminate the aux vdev wart by integrating all vdevs * into the root vdev tree. */ spa_config_exit(spa, SCL_CONFIG | SCL_STATE, FTAG); spa_config_enter(spa, SCL_CONFIG | SCL_STATE, FTAG, RW_WRITER); while ((vd = list_head(&spa->spa_state_dirty_list)) != NULL) { vdev_state_clean(vd); vdev_config_dirty(vd); } spa_config_exit(spa, SCL_CONFIG | SCL_STATE, FTAG); spa_config_enter(spa, SCL_CONFIG | SCL_STATE, FTAG, RW_READER); } spa_config_exit(spa, SCL_STATE, FTAG); dsl_pool_t *dp = spa->spa_dsl_pool; dmu_tx_t *tx = dmu_tx_create_assigned(dp, txg); spa->spa_sync_starttime = gethrtime(); taskq_cancel_id(system_delay_taskq, spa->spa_deadman_tqid); spa->spa_deadman_tqid = taskq_dispatch_delay(system_delay_taskq, spa_deadman, spa, TQ_SLEEP, ddi_get_lbolt() + NSEC_TO_TICK(spa->spa_deadman_synctime)); /* * If we are upgrading to SPA_VERSION_RAIDZ_DEFLATE this txg, * set spa_deflate if we have no raid-z vdevs. */ if (spa->spa_ubsync.ub_version < SPA_VERSION_RAIDZ_DEFLATE && spa->spa_uberblock.ub_version >= SPA_VERSION_RAIDZ_DEFLATE) { vdev_t *rvd = spa->spa_root_vdev; int i; for (i = 0; i < rvd->vdev_children; i++) { vd = rvd->vdev_child[i]; if (vd->vdev_deflate_ratio != SPA_MINBLOCKSIZE) break; } if (i == rvd->vdev_children) { spa->spa_deflate = TRUE; VERIFY0(zap_add(spa->spa_meta_objset, DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_DEFLATE, sizeof (uint64_t), 1, &spa->spa_deflate, tx)); } } spa_sync_adjust_vdev_max_queue_depth(spa); spa_sync_condense_indirect(spa, tx); spa_sync_iterate_to_convergence(spa, tx); #ifdef ZFS_DEBUG if (!list_is_empty(&spa->spa_config_dirty_list)) { /* * Make sure that the number of ZAPs for all the vdevs matches * the number of ZAPs in the per-vdev ZAP list. This only gets * called if the config is dirty; otherwise there may be * outstanding AVZ operations that weren't completed in * spa_sync_config_object. */ uint64_t all_vdev_zap_entry_count; ASSERT0(zap_count(spa->spa_meta_objset, spa->spa_all_vdev_zaps, &all_vdev_zap_entry_count)); ASSERT3U(vdev_count_verify_zaps(spa->spa_root_vdev), ==, all_vdev_zap_entry_count); } #endif if (spa->spa_vdev_removal != NULL) { ASSERT0(spa->spa_vdev_removal->svr_bytes_done[txg & TXG_MASK]); } spa_sync_rewrite_vdev_config(spa, tx); dmu_tx_commit(tx); taskq_cancel_id(system_delay_taskq, spa->spa_deadman_tqid); spa->spa_deadman_tqid = 0; /* * Clear the dirty config list. */ while ((vd = list_head(&spa->spa_config_dirty_list)) != NULL) vdev_config_clean(vd); /* * Now that the new config has synced transactionally, * let it become visible to the config cache. */ if (spa->spa_config_syncing != NULL) { spa_config_set(spa, spa->spa_config_syncing); spa->spa_config_txg = txg; spa->spa_config_syncing = NULL; } dsl_pool_sync_done(dp, txg); for (int i = 0; i < spa->spa_alloc_count; i++) { mutex_enter(&spa->spa_allocs[i].spaa_lock); VERIFY0(avl_numnodes(&spa->spa_allocs[i].spaa_tree)); mutex_exit(&spa->spa_allocs[i].spaa_lock); } /* * Update usable space statistics. */ while ((vd = txg_list_remove(&spa->spa_vdev_txg_list, TXG_CLEAN(txg))) != NULL) vdev_sync_done(vd, txg); metaslab_class_evict_old(spa->spa_normal_class, txg); metaslab_class_evict_old(spa->spa_log_class, txg); spa_sync_close_syncing_log_sm(spa); spa_update_dspace(spa); /* * It had better be the case that we didn't dirty anything * since vdev_config_sync(). */ ASSERT(txg_list_empty(&dp->dp_dirty_datasets, txg)); ASSERT(txg_list_empty(&dp->dp_dirty_dirs, txg)); ASSERT(txg_list_empty(&spa->spa_vdev_txg_list, txg)); while (zfs_pause_spa_sync) delay(1); spa->spa_sync_pass = 0; /* * Update the last synced uberblock here. We want to do this at * the end of spa_sync() so that consumers of spa_last_synced_txg() * will be guaranteed that all the processing associated with * that txg has been completed. */ spa->spa_ubsync = spa->spa_uberblock; spa_config_exit(spa, SCL_CONFIG, FTAG); spa_handle_ignored_writes(spa); /* * If any async tasks have been requested, kick them off. */ spa_async_dispatch(spa); } /* * Sync all pools. We don't want to hold the namespace lock across these * operations, so we take a reference on the spa_t and drop the lock during the * sync. */ void spa_sync_allpools(void) { spa_t *spa = NULL; mutex_enter(&spa_namespace_lock); while ((spa = spa_next(spa)) != NULL) { if (spa_state(spa) != POOL_STATE_ACTIVE || !spa_writeable(spa) || spa_suspended(spa)) continue; spa_open_ref(spa, FTAG); mutex_exit(&spa_namespace_lock); txg_wait_synced(spa_get_dsl(spa), 0); mutex_enter(&spa_namespace_lock); spa_close(spa, FTAG); } mutex_exit(&spa_namespace_lock); } /* * ========================================================================== * Miscellaneous routines * ========================================================================== */ /* * Remove all pools in the system. */ void spa_evict_all(void) { spa_t *spa; /* * Remove all cached state. All pools should be closed now, * so every spa in the AVL tree should be unreferenced. */ mutex_enter(&spa_namespace_lock); while ((spa = spa_next(NULL)) != NULL) { /* * Stop async tasks. The async thread may need to detach * a device that's been replaced, which requires grabbing * spa_namespace_lock, so we must drop it here. */ spa_open_ref(spa, FTAG); mutex_exit(&spa_namespace_lock); spa_async_suspend(spa); mutex_enter(&spa_namespace_lock); spa_close(spa, FTAG); if (spa->spa_state != POOL_STATE_UNINITIALIZED) { spa_unload(spa); spa_deactivate(spa); } spa_remove(spa); } mutex_exit(&spa_namespace_lock); } vdev_t * spa_lookup_by_guid(spa_t *spa, uint64_t guid, boolean_t aux) { vdev_t *vd; int i; if ((vd = vdev_lookup_by_guid(spa->spa_root_vdev, guid)) != NULL) return (vd); if (aux) { for (i = 0; i < spa->spa_l2cache.sav_count; i++) { vd = spa->spa_l2cache.sav_vdevs[i]; if (vd->vdev_guid == guid) return (vd); } for (i = 0; i < spa->spa_spares.sav_count; i++) { vd = spa->spa_spares.sav_vdevs[i]; if (vd->vdev_guid == guid) return (vd); } } return (NULL); } void spa_upgrade(spa_t *spa, uint64_t version) { ASSERT(spa_writeable(spa)); spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER); /* * This should only be called for a non-faulted pool, and since a * future version would result in an unopenable pool, this shouldn't be * possible. */ ASSERT(SPA_VERSION_IS_SUPPORTED(spa->spa_uberblock.ub_version)); ASSERT3U(version, >=, spa->spa_uberblock.ub_version); spa->spa_uberblock.ub_version = version; vdev_config_dirty(spa->spa_root_vdev); spa_config_exit(spa, SCL_ALL, FTAG); txg_wait_synced(spa_get_dsl(spa), 0); } static boolean_t spa_has_aux_vdev(spa_t *spa, uint64_t guid, spa_aux_vdev_t *sav) { (void) spa; int i; uint64_t vdev_guid; for (i = 0; i < sav->sav_count; i++) if (sav->sav_vdevs[i]->vdev_guid == guid) return (B_TRUE); for (i = 0; i < sav->sav_npending; i++) { if (nvlist_lookup_uint64(sav->sav_pending[i], ZPOOL_CONFIG_GUID, &vdev_guid) == 0 && vdev_guid == guid) return (B_TRUE); } return (B_FALSE); } boolean_t spa_has_l2cache(spa_t *spa, uint64_t guid) { return (spa_has_aux_vdev(spa, guid, &spa->spa_l2cache)); } boolean_t spa_has_spare(spa_t *spa, uint64_t guid) { return (spa_has_aux_vdev(spa, guid, &spa->spa_spares)); } /* * Check if a pool has an active shared spare device. * Note: reference count of an active spare is 2, as a spare and as a replace */ static boolean_t spa_has_active_shared_spare(spa_t *spa) { int i, refcnt; uint64_t pool; spa_aux_vdev_t *sav = &spa->spa_spares; for (i = 0; i < sav->sav_count; i++) { if (spa_spare_exists(sav->sav_vdevs[i]->vdev_guid, &pool, &refcnt) && pool != 0ULL && pool == spa_guid(spa) && refcnt > 2) return (B_TRUE); } return (B_FALSE); } uint64_t spa_total_metaslabs(spa_t *spa) { vdev_t *rvd = spa->spa_root_vdev; uint64_t m = 0; for (uint64_t c = 0; c < rvd->vdev_children; c++) { vdev_t *vd = rvd->vdev_child[c]; if (!vdev_is_concrete(vd)) continue; m += vd->vdev_ms_count; } return (m); } /* * Notify any waiting threads that some activity has switched from being in- * progress to not-in-progress so that the thread can wake up and determine * whether it is finished waiting. */ void spa_notify_waiters(spa_t *spa) { /* * Acquiring spa_activities_lock here prevents the cv_broadcast from * happening between the waiting thread's check and cv_wait. */ mutex_enter(&spa->spa_activities_lock); cv_broadcast(&spa->spa_activities_cv); mutex_exit(&spa->spa_activities_lock); } /* * Notify any waiting threads that the pool is exporting, and then block until * they are finished using the spa_t. */ void spa_wake_waiters(spa_t *spa) { mutex_enter(&spa->spa_activities_lock); spa->spa_waiters_cancel = B_TRUE; cv_broadcast(&spa->spa_activities_cv); while (spa->spa_waiters != 0) cv_wait(&spa->spa_waiters_cv, &spa->spa_activities_lock); spa->spa_waiters_cancel = B_FALSE; mutex_exit(&spa->spa_activities_lock); } /* Whether the vdev or any of its descendants are being initialized/trimmed. */ static boolean_t spa_vdev_activity_in_progress_impl(vdev_t *vd, zpool_wait_activity_t activity) { spa_t *spa = vd->vdev_spa; ASSERT(spa_config_held(spa, SCL_CONFIG | SCL_STATE, RW_READER)); ASSERT(MUTEX_HELD(&spa->spa_activities_lock)); ASSERT(activity == ZPOOL_WAIT_INITIALIZE || activity == ZPOOL_WAIT_TRIM); kmutex_t *lock = activity == ZPOOL_WAIT_INITIALIZE ? &vd->vdev_initialize_lock : &vd->vdev_trim_lock; mutex_exit(&spa->spa_activities_lock); mutex_enter(lock); mutex_enter(&spa->spa_activities_lock); boolean_t in_progress = (activity == ZPOOL_WAIT_INITIALIZE) ? (vd->vdev_initialize_state == VDEV_INITIALIZE_ACTIVE) : (vd->vdev_trim_state == VDEV_TRIM_ACTIVE); mutex_exit(lock); if (in_progress) return (B_TRUE); for (int i = 0; i < vd->vdev_children; i++) { if (spa_vdev_activity_in_progress_impl(vd->vdev_child[i], activity)) return (B_TRUE); } return (B_FALSE); } /* * If use_guid is true, this checks whether the vdev specified by guid is * being initialized/trimmed. Otherwise, it checks whether any vdev in the pool * is being initialized/trimmed. The caller must hold the config lock and * spa_activities_lock. */ static int spa_vdev_activity_in_progress(spa_t *spa, boolean_t use_guid, uint64_t guid, zpool_wait_activity_t activity, boolean_t *in_progress) { mutex_exit(&spa->spa_activities_lock); spa_config_enter(spa, SCL_CONFIG | SCL_STATE, FTAG, RW_READER); mutex_enter(&spa->spa_activities_lock); vdev_t *vd; if (use_guid) { vd = spa_lookup_by_guid(spa, guid, B_FALSE); if (vd == NULL || !vd->vdev_ops->vdev_op_leaf) { spa_config_exit(spa, SCL_CONFIG | SCL_STATE, FTAG); return (EINVAL); } } else { vd = spa->spa_root_vdev; } *in_progress = spa_vdev_activity_in_progress_impl(vd, activity); spa_config_exit(spa, SCL_CONFIG | SCL_STATE, FTAG); return (0); } /* * Locking for waiting threads * --------------------------- * * Waiting threads need a way to check whether a given activity is in progress, * and then, if it is, wait for it to complete. Each activity will have some * in-memory representation of the relevant on-disk state which can be used to * determine whether or not the activity is in progress. The in-memory state and * the locking used to protect it will be different for each activity, and may * not be suitable for use with a cvar (e.g., some state is protected by the * config lock). To allow waiting threads to wait without any races, another * lock, spa_activities_lock, is used. * * When the state is checked, both the activity-specific lock (if there is one) * and spa_activities_lock are held. In some cases, the activity-specific lock * is acquired explicitly (e.g. the config lock). In others, the locking is * internal to some check (e.g. bpobj_is_empty). After checking, the waiting * thread releases the activity-specific lock and, if the activity is in * progress, then cv_waits using spa_activities_lock. * * The waiting thread is woken when another thread, one completing some * activity, updates the state of the activity and then calls * spa_notify_waiters, which will cv_broadcast. This 'completing' thread only * needs to hold its activity-specific lock when updating the state, and this * lock can (but doesn't have to) be dropped before calling spa_notify_waiters. * * Because spa_notify_waiters acquires spa_activities_lock before broadcasting, * and because it is held when the waiting thread checks the state of the * activity, it can never be the case that the completing thread both updates * the activity state and cv_broadcasts in between the waiting thread's check * and cv_wait. Thus, a waiting thread can never miss a wakeup. * * In order to prevent deadlock, when the waiting thread does its check, in some * cases it will temporarily drop spa_activities_lock in order to acquire the * activity-specific lock. The order in which spa_activities_lock and the * activity specific lock are acquired in the waiting thread is determined by * the order in which they are acquired in the completing thread; if the * completing thread calls spa_notify_waiters with the activity-specific lock * held, then the waiting thread must also acquire the activity-specific lock * first. */ static int spa_activity_in_progress(spa_t *spa, zpool_wait_activity_t activity, boolean_t use_tag, uint64_t tag, boolean_t *in_progress) { int error = 0; ASSERT(MUTEX_HELD(&spa->spa_activities_lock)); switch (activity) { case ZPOOL_WAIT_CKPT_DISCARD: *in_progress = (spa_feature_is_active(spa, SPA_FEATURE_POOL_CHECKPOINT) && zap_contains(spa_meta_objset(spa), DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_ZPOOL_CHECKPOINT) == ENOENT); break; case ZPOOL_WAIT_FREE: *in_progress = ((spa_version(spa) >= SPA_VERSION_DEADLISTS && !bpobj_is_empty(&spa->spa_dsl_pool->dp_free_bpobj)) || spa_feature_is_active(spa, SPA_FEATURE_ASYNC_DESTROY) || spa_livelist_delete_check(spa)); break; case ZPOOL_WAIT_INITIALIZE: case ZPOOL_WAIT_TRIM: error = spa_vdev_activity_in_progress(spa, use_tag, tag, activity, in_progress); break; case ZPOOL_WAIT_REPLACE: mutex_exit(&spa->spa_activities_lock); spa_config_enter(spa, SCL_CONFIG | SCL_STATE, FTAG, RW_READER); mutex_enter(&spa->spa_activities_lock); *in_progress = vdev_replace_in_progress(spa->spa_root_vdev); spa_config_exit(spa, SCL_CONFIG | SCL_STATE, FTAG); break; case ZPOOL_WAIT_REMOVE: *in_progress = (spa->spa_removing_phys.sr_state == DSS_SCANNING); break; case ZPOOL_WAIT_RESILVER: if ((*in_progress = vdev_rebuild_active(spa->spa_root_vdev))) break; zfs_fallthrough; case ZPOOL_WAIT_SCRUB: { boolean_t scanning, paused, is_scrub; dsl_scan_t *scn = spa->spa_dsl_pool->dp_scan; is_scrub = (scn->scn_phys.scn_func == POOL_SCAN_SCRUB); scanning = (scn->scn_phys.scn_state == DSS_SCANNING); paused = dsl_scan_is_paused_scrub(scn); *in_progress = (scanning && !paused && is_scrub == (activity == ZPOOL_WAIT_SCRUB)); break; } default: panic("unrecognized value for activity %d", activity); } return (error); } static int spa_wait_common(const char *pool, zpool_wait_activity_t activity, boolean_t use_tag, uint64_t tag, boolean_t *waited) { /* * The tag is used to distinguish between instances of an activity. * 'initialize' and 'trim' are the only activities that we use this for. * The other activities can only have a single instance in progress in a * pool at one time, making the tag unnecessary. * * There can be multiple devices being replaced at once, but since they * all finish once resilvering finishes, we don't bother keeping track * of them individually, we just wait for them all to finish. */ if (use_tag && activity != ZPOOL_WAIT_INITIALIZE && activity != ZPOOL_WAIT_TRIM) return (EINVAL); if (activity < 0 || activity >= ZPOOL_WAIT_NUM_ACTIVITIES) return (EINVAL); spa_t *spa; int error = spa_open(pool, &spa, FTAG); if (error != 0) return (error); /* * Increment the spa's waiter count so that we can call spa_close and * still ensure that the spa_t doesn't get freed before this thread is * finished with it when the pool is exported. We want to call spa_close * before we start waiting because otherwise the additional ref would * prevent the pool from being exported or destroyed throughout the * potentially long wait. */ mutex_enter(&spa->spa_activities_lock); spa->spa_waiters++; spa_close(spa, FTAG); *waited = B_FALSE; for (;;) { boolean_t in_progress; error = spa_activity_in_progress(spa, activity, use_tag, tag, &in_progress); if (error || !in_progress || spa->spa_waiters_cancel) break; *waited = B_TRUE; if (cv_wait_sig(&spa->spa_activities_cv, &spa->spa_activities_lock) == 0) { error = EINTR; break; } } spa->spa_waiters--; cv_signal(&spa->spa_waiters_cv); mutex_exit(&spa->spa_activities_lock); return (error); } /* * Wait for a particular instance of the specified activity to complete, where * the instance is identified by 'tag' */ int spa_wait_tag(const char *pool, zpool_wait_activity_t activity, uint64_t tag, boolean_t *waited) { return (spa_wait_common(pool, activity, B_TRUE, tag, waited)); } /* * Wait for all instances of the specified activity complete */ int spa_wait(const char *pool, zpool_wait_activity_t activity, boolean_t *waited) { return (spa_wait_common(pool, activity, B_FALSE, 0, waited)); } sysevent_t * spa_event_create(spa_t *spa, vdev_t *vd, nvlist_t *hist_nvl, const char *name) { sysevent_t *ev = NULL; #ifdef _KERNEL nvlist_t *resource; resource = zfs_event_create(spa, vd, FM_SYSEVENT_CLASS, name, hist_nvl); if (resource) { ev = kmem_alloc(sizeof (sysevent_t), KM_SLEEP); ev->resource = resource; } #else (void) spa, (void) vd, (void) hist_nvl, (void) name; #endif return (ev); } void spa_event_post(sysevent_t *ev) { #ifdef _KERNEL if (ev) { zfs_zevent_post(ev->resource, NULL, zfs_zevent_post_cb); kmem_free(ev, sizeof (*ev)); } #else (void) ev; #endif } /* * Post a zevent corresponding to the given sysevent. The 'name' must be one * of the event definitions in sys/sysevent/eventdefs.h. The payload will be * filled in from the spa and (optionally) the vdev. This doesn't do anything * in the userland libzpool, as we don't want consumers to misinterpret ztest * or zdb as real changes. */ void spa_event_notify(spa_t *spa, vdev_t *vd, nvlist_t *hist_nvl, const char *name) { spa_event_post(spa_event_create(spa, vd, hist_nvl, name)); } /* state manipulation functions */ EXPORT_SYMBOL(spa_open); EXPORT_SYMBOL(spa_open_rewind); EXPORT_SYMBOL(spa_get_stats); EXPORT_SYMBOL(spa_create); EXPORT_SYMBOL(spa_import); EXPORT_SYMBOL(spa_tryimport); EXPORT_SYMBOL(spa_destroy); EXPORT_SYMBOL(spa_export); EXPORT_SYMBOL(spa_reset); EXPORT_SYMBOL(spa_async_request); EXPORT_SYMBOL(spa_async_suspend); EXPORT_SYMBOL(spa_async_resume); EXPORT_SYMBOL(spa_inject_addref); EXPORT_SYMBOL(spa_inject_delref); EXPORT_SYMBOL(spa_scan_stat_init); EXPORT_SYMBOL(spa_scan_get_stats); /* device manipulation */ EXPORT_SYMBOL(spa_vdev_add); EXPORT_SYMBOL(spa_vdev_attach); EXPORT_SYMBOL(spa_vdev_detach); EXPORT_SYMBOL(spa_vdev_setpath); EXPORT_SYMBOL(spa_vdev_setfru); EXPORT_SYMBOL(spa_vdev_split_mirror); /* spare statech is global across all pools) */ EXPORT_SYMBOL(spa_spare_add); EXPORT_SYMBOL(spa_spare_remove); EXPORT_SYMBOL(spa_spare_exists); EXPORT_SYMBOL(spa_spare_activate); /* L2ARC statech is global across all pools) */ EXPORT_SYMBOL(spa_l2cache_add); EXPORT_SYMBOL(spa_l2cache_remove); EXPORT_SYMBOL(spa_l2cache_exists); EXPORT_SYMBOL(spa_l2cache_activate); EXPORT_SYMBOL(spa_l2cache_drop); /* scanning */ EXPORT_SYMBOL(spa_scan); EXPORT_SYMBOL(spa_scan_stop); /* spa syncing */ EXPORT_SYMBOL(spa_sync); /* only for DMU use */ EXPORT_SYMBOL(spa_sync_allpools); /* properties */ EXPORT_SYMBOL(spa_prop_set); EXPORT_SYMBOL(spa_prop_get); EXPORT_SYMBOL(spa_prop_clear_bootfs); /* asynchronous event notification */ EXPORT_SYMBOL(spa_event_notify); /* BEGIN CSTYLED */ ZFS_MODULE_PARAM(zfs_spa, spa_, load_verify_shift, UINT, ZMOD_RW, "log2 fraction of arc that can be used by inflight I/Os when " "verifying pool during import"); /* END CSTYLED */ ZFS_MODULE_PARAM(zfs_spa, spa_, load_verify_metadata, INT, ZMOD_RW, "Set to traverse metadata on pool import"); ZFS_MODULE_PARAM(zfs_spa, spa_, load_verify_data, INT, ZMOD_RW, "Set to traverse data on pool import"); ZFS_MODULE_PARAM(zfs_spa, spa_, load_print_vdev_tree, INT, ZMOD_RW, "Print vdev tree to zfs_dbgmsg during pool import"); ZFS_MODULE_PARAM(zfs_zio, zio_, taskq_batch_pct, UINT, ZMOD_RD, "Percentage of CPUs to run an IO worker thread"); ZFS_MODULE_PARAM(zfs_zio, zio_, taskq_batch_tpq, UINT, ZMOD_RD, "Number of threads per IO worker taskqueue"); /* BEGIN CSTYLED */ ZFS_MODULE_PARAM(zfs, zfs_, max_missing_tvds, U64, ZMOD_RW, "Allow importing pool with up to this number of missing top-level " "vdevs (in read-only mode)"); /* END CSTYLED */ ZFS_MODULE_PARAM(zfs_livelist_condense, zfs_livelist_condense_, zthr_pause, INT, ZMOD_RW, "Set the livelist condense zthr to pause"); ZFS_MODULE_PARAM(zfs_livelist_condense, zfs_livelist_condense_, sync_pause, INT, ZMOD_RW, "Set the livelist condense synctask to pause"); /* BEGIN CSTYLED */ ZFS_MODULE_PARAM(zfs_livelist_condense, zfs_livelist_condense_, sync_cancel, INT, ZMOD_RW, "Whether livelist condensing was canceled in the synctask"); ZFS_MODULE_PARAM(zfs_livelist_condense, zfs_livelist_condense_, zthr_cancel, INT, ZMOD_RW, "Whether livelist condensing was canceled in the zthr function"); ZFS_MODULE_PARAM(zfs_livelist_condense, zfs_livelist_condense_, new_alloc, INT, ZMOD_RW, "Whether extra ALLOC blkptrs were added to a livelist entry while it " "was being condensed"); /* END CSTYLED */ diff --git a/module/zfs/vdev.c b/module/zfs/vdev.c index 7d22a66c7819..8c62112de71b 100644 --- a/module/zfs/vdev.c +++ b/module/zfs/vdev.c @@ -1,6166 +1,6165 @@ /* * CDDL HEADER START * * The contents of this file are subject to the terms of the * Common Development and Distribution License (the "License"). * You may not use this file except in compliance with the License. * * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE * or https://opensource.org/licenses/CDDL-1.0. * See the License for the specific language governing permissions * and limitations under the License. * * When distributing Covered Code, include this CDDL HEADER in each * file and include the License file at usr/src/OPENSOLARIS.LICENSE. * If applicable, add the following below this CDDL HEADER, with the * fields enclosed by brackets "[]" replaced with your own identifying * information: Portions Copyright [yyyy] [name of copyright owner] * * CDDL HEADER END */ /* * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. * Copyright (c) 2011, 2021 by Delphix. All rights reserved. * Copyright 2017 Nexenta Systems, Inc. * Copyright (c) 2014 Integros [integros.com] * Copyright 2016 Toomas Soome * Copyright 2017 Joyent, Inc. * Copyright (c) 2017, Intel Corporation. * Copyright (c) 2019, Datto Inc. All rights reserved. * Copyright (c) 2021, Klara Inc. * Copyright [2021] Hewlett Packard Enterprise Development LP */ #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include "zfs_prop.h" /* * One metaslab from each (normal-class) vdev is used by the ZIL. These are * called "embedded slog metaslabs", are referenced by vdev_log_mg, and are * part of the spa_embedded_log_class. The metaslab with the most free space * in each vdev is selected for this purpose when the pool is opened (or a * vdev is added). See vdev_metaslab_init(). * * Log blocks can be allocated from the following locations. Each one is tried * in order until the allocation succeeds: * 1. dedicated log vdevs, aka "slog" (spa_log_class) * 2. embedded slog metaslabs (spa_embedded_log_class) * 3. other metaslabs in normal vdevs (spa_normal_class) * * zfs_embedded_slog_min_ms disables the embedded slog if there are fewer * than this number of metaslabs in the vdev. This ensures that we don't set * aside an unreasonable amount of space for the ZIL. If set to less than * 1 << (spa_slop_shift + 1), on small pools the usable space may be reduced * (by more than 1<vdev_path != NULL) { zfs_dbgmsg("%s vdev '%s': %s", vd->vdev_ops->vdev_op_type, vd->vdev_path, buf); } else { zfs_dbgmsg("%s-%llu vdev (guid %llu): %s", vd->vdev_ops->vdev_op_type, (u_longlong_t)vd->vdev_id, (u_longlong_t)vd->vdev_guid, buf); } } void vdev_dbgmsg_print_tree(vdev_t *vd, int indent) { char state[20]; if (vd->vdev_ishole || vd->vdev_ops == &vdev_missing_ops) { zfs_dbgmsg("%*svdev %llu: %s", indent, "", (u_longlong_t)vd->vdev_id, vd->vdev_ops->vdev_op_type); return; } switch (vd->vdev_state) { case VDEV_STATE_UNKNOWN: (void) snprintf(state, sizeof (state), "unknown"); break; case VDEV_STATE_CLOSED: (void) snprintf(state, sizeof (state), "closed"); break; case VDEV_STATE_OFFLINE: (void) snprintf(state, sizeof (state), "offline"); break; case VDEV_STATE_REMOVED: (void) snprintf(state, sizeof (state), "removed"); break; case VDEV_STATE_CANT_OPEN: (void) snprintf(state, sizeof (state), "can't open"); break; case VDEV_STATE_FAULTED: (void) snprintf(state, sizeof (state), "faulted"); break; case VDEV_STATE_DEGRADED: (void) snprintf(state, sizeof (state), "degraded"); break; case VDEV_STATE_HEALTHY: (void) snprintf(state, sizeof (state), "healthy"); break; default: (void) snprintf(state, sizeof (state), "", (uint_t)vd->vdev_state); } zfs_dbgmsg("%*svdev %u: %s%s, guid: %llu, path: %s, %s", indent, "", (int)vd->vdev_id, vd->vdev_ops->vdev_op_type, vd->vdev_islog ? " (log)" : "", (u_longlong_t)vd->vdev_guid, vd->vdev_path ? vd->vdev_path : "N/A", state); for (uint64_t i = 0; i < vd->vdev_children; i++) vdev_dbgmsg_print_tree(vd->vdev_child[i], indent + 2); } /* * Virtual device management. */ static vdev_ops_t *const vdev_ops_table[] = { &vdev_root_ops, &vdev_raidz_ops, &vdev_draid_ops, &vdev_draid_spare_ops, &vdev_mirror_ops, &vdev_replacing_ops, &vdev_spare_ops, &vdev_disk_ops, &vdev_file_ops, &vdev_missing_ops, &vdev_hole_ops, &vdev_indirect_ops, NULL }; /* * Given a vdev type, return the appropriate ops vector. */ static vdev_ops_t * vdev_getops(const char *type) { vdev_ops_t *ops, *const *opspp; for (opspp = vdev_ops_table; (ops = *opspp) != NULL; opspp++) if (strcmp(ops->vdev_op_type, type) == 0) break; return (ops); } /* * Given a vdev and a metaslab class, find which metaslab group we're * interested in. All vdevs may belong to two different metaslab classes. * Dedicated slog devices use only the primary metaslab group, rather than a * separate log group. For embedded slogs, the vdev_log_mg will be non-NULL. */ metaslab_group_t * vdev_get_mg(vdev_t *vd, metaslab_class_t *mc) { if (mc == spa_embedded_log_class(vd->vdev_spa) && vd->vdev_log_mg != NULL) return (vd->vdev_log_mg); else return (vd->vdev_mg); } void vdev_default_xlate(vdev_t *vd, const range_seg64_t *logical_rs, range_seg64_t *physical_rs, range_seg64_t *remain_rs) { (void) vd, (void) remain_rs; physical_rs->rs_start = logical_rs->rs_start; physical_rs->rs_end = logical_rs->rs_end; } /* * Derive the enumerated allocation bias from string input. * String origin is either the per-vdev zap or zpool(8). */ static vdev_alloc_bias_t vdev_derive_alloc_bias(const char *bias) { vdev_alloc_bias_t alloc_bias = VDEV_BIAS_NONE; if (strcmp(bias, VDEV_ALLOC_BIAS_LOG) == 0) alloc_bias = VDEV_BIAS_LOG; else if (strcmp(bias, VDEV_ALLOC_BIAS_SPECIAL) == 0) alloc_bias = VDEV_BIAS_SPECIAL; else if (strcmp(bias, VDEV_ALLOC_BIAS_DEDUP) == 0) alloc_bias = VDEV_BIAS_DEDUP; return (alloc_bias); } /* * Default asize function: return the MAX of psize with the asize of * all children. This is what's used by anything other than RAID-Z. */ uint64_t vdev_default_asize(vdev_t *vd, uint64_t psize) { uint64_t asize = P2ROUNDUP(psize, 1ULL << vd->vdev_top->vdev_ashift); uint64_t csize; for (int c = 0; c < vd->vdev_children; c++) { csize = vdev_psize_to_asize(vd->vdev_child[c], psize); asize = MAX(asize, csize); } return (asize); } uint64_t vdev_default_min_asize(vdev_t *vd) { return (vd->vdev_min_asize); } /* * Get the minimum allocatable size. We define the allocatable size as * the vdev's asize rounded to the nearest metaslab. This allows us to * replace or attach devices which don't have the same physical size but * can still satisfy the same number of allocations. */ uint64_t vdev_get_min_asize(vdev_t *vd) { vdev_t *pvd = vd->vdev_parent; /* * If our parent is NULL (inactive spare or cache) or is the root, * just return our own asize. */ if (pvd == NULL) return (vd->vdev_asize); /* * The top-level vdev just returns the allocatable size rounded * to the nearest metaslab. */ if (vd == vd->vdev_top) return (P2ALIGN(vd->vdev_asize, 1ULL << vd->vdev_ms_shift)); return (pvd->vdev_ops->vdev_op_min_asize(pvd)); } void vdev_set_min_asize(vdev_t *vd) { vd->vdev_min_asize = vdev_get_min_asize(vd); for (int c = 0; c < vd->vdev_children; c++) vdev_set_min_asize(vd->vdev_child[c]); } /* * Get the minimal allocation size for the top-level vdev. */ uint64_t vdev_get_min_alloc(vdev_t *vd) { uint64_t min_alloc = 1ULL << vd->vdev_ashift; if (vd->vdev_ops->vdev_op_min_alloc != NULL) min_alloc = vd->vdev_ops->vdev_op_min_alloc(vd); return (min_alloc); } /* * Get the parity level for a top-level vdev. */ uint64_t vdev_get_nparity(vdev_t *vd) { uint64_t nparity = 0; if (vd->vdev_ops->vdev_op_nparity != NULL) nparity = vd->vdev_ops->vdev_op_nparity(vd); return (nparity); } /* * Get the number of data disks for a top-level vdev. */ uint64_t vdev_get_ndisks(vdev_t *vd) { uint64_t ndisks = 1; if (vd->vdev_ops->vdev_op_ndisks != NULL) ndisks = vd->vdev_ops->vdev_op_ndisks(vd); return (ndisks); } vdev_t * vdev_lookup_top(spa_t *spa, uint64_t vdev) { vdev_t *rvd = spa->spa_root_vdev; ASSERT(spa_config_held(spa, SCL_ALL, RW_READER) != 0); if (vdev < rvd->vdev_children) { ASSERT(rvd->vdev_child[vdev] != NULL); return (rvd->vdev_child[vdev]); } return (NULL); } vdev_t * vdev_lookup_by_guid(vdev_t *vd, uint64_t guid) { vdev_t *mvd; if (vd->vdev_guid == guid) return (vd); for (int c = 0; c < vd->vdev_children; c++) if ((mvd = vdev_lookup_by_guid(vd->vdev_child[c], guid)) != NULL) return (mvd); return (NULL); } static int vdev_count_leaves_impl(vdev_t *vd) { int n = 0; if (vd->vdev_ops->vdev_op_leaf) return (1); for (int c = 0; c < vd->vdev_children; c++) n += vdev_count_leaves_impl(vd->vdev_child[c]); return (n); } int vdev_count_leaves(spa_t *spa) { int rc; spa_config_enter(spa, SCL_VDEV, FTAG, RW_READER); rc = vdev_count_leaves_impl(spa->spa_root_vdev); spa_config_exit(spa, SCL_VDEV, FTAG); return (rc); } void vdev_add_child(vdev_t *pvd, vdev_t *cvd) { size_t oldsize, newsize; uint64_t id = cvd->vdev_id; vdev_t **newchild; ASSERT(spa_config_held(cvd->vdev_spa, SCL_ALL, RW_WRITER) == SCL_ALL); ASSERT(cvd->vdev_parent == NULL); cvd->vdev_parent = pvd; if (pvd == NULL) return; ASSERT(id >= pvd->vdev_children || pvd->vdev_child[id] == NULL); oldsize = pvd->vdev_children * sizeof (vdev_t *); pvd->vdev_children = MAX(pvd->vdev_children, id + 1); newsize = pvd->vdev_children * sizeof (vdev_t *); newchild = kmem_alloc(newsize, KM_SLEEP); if (pvd->vdev_child != NULL) { memcpy(newchild, pvd->vdev_child, oldsize); kmem_free(pvd->vdev_child, oldsize); } pvd->vdev_child = newchild; pvd->vdev_child[id] = cvd; cvd->vdev_top = (pvd->vdev_top ? pvd->vdev_top: cvd); ASSERT(cvd->vdev_top->vdev_parent->vdev_parent == NULL); /* * Walk up all ancestors to update guid sum. */ for (; pvd != NULL; pvd = pvd->vdev_parent) pvd->vdev_guid_sum += cvd->vdev_guid_sum; if (cvd->vdev_ops->vdev_op_leaf) { list_insert_head(&cvd->vdev_spa->spa_leaf_list, cvd); cvd->vdev_spa->spa_leaf_list_gen++; } } void vdev_remove_child(vdev_t *pvd, vdev_t *cvd) { int c; uint_t id = cvd->vdev_id; ASSERT(cvd->vdev_parent == pvd); if (pvd == NULL) return; ASSERT(id < pvd->vdev_children); ASSERT(pvd->vdev_child[id] == cvd); pvd->vdev_child[id] = NULL; cvd->vdev_parent = NULL; for (c = 0; c < pvd->vdev_children; c++) if (pvd->vdev_child[c]) break; if (c == pvd->vdev_children) { kmem_free(pvd->vdev_child, c * sizeof (vdev_t *)); pvd->vdev_child = NULL; pvd->vdev_children = 0; } if (cvd->vdev_ops->vdev_op_leaf) { spa_t *spa = cvd->vdev_spa; list_remove(&spa->spa_leaf_list, cvd); spa->spa_leaf_list_gen++; } /* * Walk up all ancestors to update guid sum. */ for (; pvd != NULL; pvd = pvd->vdev_parent) pvd->vdev_guid_sum -= cvd->vdev_guid_sum; } /* * Remove any holes in the child array. */ void vdev_compact_children(vdev_t *pvd) { vdev_t **newchild, *cvd; int oldc = pvd->vdev_children; int newc; ASSERT(spa_config_held(pvd->vdev_spa, SCL_ALL, RW_WRITER) == SCL_ALL); if (oldc == 0) return; for (int c = newc = 0; c < oldc; c++) if (pvd->vdev_child[c]) newc++; if (newc > 0) { newchild = kmem_zalloc(newc * sizeof (vdev_t *), KM_SLEEP); for (int c = newc = 0; c < oldc; c++) { if ((cvd = pvd->vdev_child[c]) != NULL) { newchild[newc] = cvd; cvd->vdev_id = newc++; } } } else { newchild = NULL; } kmem_free(pvd->vdev_child, oldc * sizeof (vdev_t *)); pvd->vdev_child = newchild; pvd->vdev_children = newc; } /* * Allocate and minimally initialize a vdev_t. */ vdev_t * vdev_alloc_common(spa_t *spa, uint_t id, uint64_t guid, vdev_ops_t *ops) { vdev_t *vd; vdev_indirect_config_t *vic; vd = kmem_zalloc(sizeof (vdev_t), KM_SLEEP); vic = &vd->vdev_indirect_config; if (spa->spa_root_vdev == NULL) { ASSERT(ops == &vdev_root_ops); spa->spa_root_vdev = vd; spa->spa_load_guid = spa_generate_guid(NULL); } if (guid == 0 && ops != &vdev_hole_ops) { if (spa->spa_root_vdev == vd) { /* * The root vdev's guid will also be the pool guid, * which must be unique among all pools. */ guid = spa_generate_guid(NULL); } else { /* * Any other vdev's guid must be unique within the pool. */ guid = spa_generate_guid(spa); } ASSERT(!spa_guid_exists(spa_guid(spa), guid)); } vd->vdev_spa = spa; vd->vdev_id = id; vd->vdev_guid = guid; vd->vdev_guid_sum = guid; vd->vdev_ops = ops; vd->vdev_state = VDEV_STATE_CLOSED; vd->vdev_ishole = (ops == &vdev_hole_ops); vic->vic_prev_indirect_vdev = UINT64_MAX; rw_init(&vd->vdev_indirect_rwlock, NULL, RW_DEFAULT, NULL); mutex_init(&vd->vdev_obsolete_lock, NULL, MUTEX_DEFAULT, NULL); vd->vdev_obsolete_segments = range_tree_create(NULL, RANGE_SEG64, NULL, 0, 0); /* * Initialize rate limit structs for events. We rate limit ZIO delay * and checksum events so that we don't overwhelm ZED with thousands * of events when a disk is acting up. */ zfs_ratelimit_init(&vd->vdev_delay_rl, &zfs_slow_io_events_per_second, 1); zfs_ratelimit_init(&vd->vdev_deadman_rl, &zfs_slow_io_events_per_second, 1); zfs_ratelimit_init(&vd->vdev_checksum_rl, &zfs_checksum_events_per_second, 1); list_link_init(&vd->vdev_config_dirty_node); list_link_init(&vd->vdev_state_dirty_node); list_link_init(&vd->vdev_initialize_node); list_link_init(&vd->vdev_leaf_node); list_link_init(&vd->vdev_trim_node); mutex_init(&vd->vdev_dtl_lock, NULL, MUTEX_NOLOCKDEP, NULL); mutex_init(&vd->vdev_stat_lock, NULL, MUTEX_DEFAULT, NULL); mutex_init(&vd->vdev_probe_lock, NULL, MUTEX_DEFAULT, NULL); mutex_init(&vd->vdev_scan_io_queue_lock, NULL, MUTEX_DEFAULT, NULL); mutex_init(&vd->vdev_initialize_lock, NULL, MUTEX_DEFAULT, NULL); mutex_init(&vd->vdev_initialize_io_lock, NULL, MUTEX_DEFAULT, NULL); cv_init(&vd->vdev_initialize_cv, NULL, CV_DEFAULT, NULL); cv_init(&vd->vdev_initialize_io_cv, NULL, CV_DEFAULT, NULL); mutex_init(&vd->vdev_trim_lock, NULL, MUTEX_DEFAULT, NULL); mutex_init(&vd->vdev_autotrim_lock, NULL, MUTEX_DEFAULT, NULL); mutex_init(&vd->vdev_trim_io_lock, NULL, MUTEX_DEFAULT, NULL); cv_init(&vd->vdev_trim_cv, NULL, CV_DEFAULT, NULL); cv_init(&vd->vdev_autotrim_cv, NULL, CV_DEFAULT, NULL); cv_init(&vd->vdev_trim_io_cv, NULL, CV_DEFAULT, NULL); mutex_init(&vd->vdev_rebuild_lock, NULL, MUTEX_DEFAULT, NULL); cv_init(&vd->vdev_rebuild_cv, NULL, CV_DEFAULT, NULL); for (int t = 0; t < DTL_TYPES; t++) { vd->vdev_dtl[t] = range_tree_create(NULL, RANGE_SEG64, NULL, 0, 0); } txg_list_create(&vd->vdev_ms_list, spa, offsetof(struct metaslab, ms_txg_node)); txg_list_create(&vd->vdev_dtl_list, spa, offsetof(struct vdev, vdev_dtl_node)); vd->vdev_stat.vs_timestamp = gethrtime(); vdev_queue_init(vd); vdev_cache_init(vd); return (vd); } /* * Allocate a new vdev. The 'alloctype' is used to control whether we are * creating a new vdev or loading an existing one - the behavior is slightly * different for each case. */ int vdev_alloc(spa_t *spa, vdev_t **vdp, nvlist_t *nv, vdev_t *parent, uint_t id, int alloctype) { vdev_ops_t *ops; char *type; uint64_t guid = 0, islog; vdev_t *vd; vdev_indirect_config_t *vic; char *tmp = NULL; int rc; vdev_alloc_bias_t alloc_bias = VDEV_BIAS_NONE; boolean_t top_level = (parent && !parent->vdev_parent); ASSERT(spa_config_held(spa, SCL_ALL, RW_WRITER) == SCL_ALL); if (nvlist_lookup_string(nv, ZPOOL_CONFIG_TYPE, &type) != 0) return (SET_ERROR(EINVAL)); if ((ops = vdev_getops(type)) == NULL) return (SET_ERROR(EINVAL)); /* * If this is a load, get the vdev guid from the nvlist. * Otherwise, vdev_alloc_common() will generate one for us. */ if (alloctype == VDEV_ALLOC_LOAD) { uint64_t label_id; if (nvlist_lookup_uint64(nv, ZPOOL_CONFIG_ID, &label_id) || label_id != id) return (SET_ERROR(EINVAL)); if (nvlist_lookup_uint64(nv, ZPOOL_CONFIG_GUID, &guid) != 0) return (SET_ERROR(EINVAL)); } else if (alloctype == VDEV_ALLOC_SPARE) { if (nvlist_lookup_uint64(nv, ZPOOL_CONFIG_GUID, &guid) != 0) return (SET_ERROR(EINVAL)); } else if (alloctype == VDEV_ALLOC_L2CACHE) { if (nvlist_lookup_uint64(nv, ZPOOL_CONFIG_GUID, &guid) != 0) return (SET_ERROR(EINVAL)); } else if (alloctype == VDEV_ALLOC_ROOTPOOL) { if (nvlist_lookup_uint64(nv, ZPOOL_CONFIG_GUID, &guid) != 0) return (SET_ERROR(EINVAL)); } /* * The first allocated vdev must be of type 'root'. */ if (ops != &vdev_root_ops && spa->spa_root_vdev == NULL) return (SET_ERROR(EINVAL)); /* * Determine whether we're a log vdev. */ islog = 0; (void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_IS_LOG, &islog); if (islog && spa_version(spa) < SPA_VERSION_SLOGS) return (SET_ERROR(ENOTSUP)); if (ops == &vdev_hole_ops && spa_version(spa) < SPA_VERSION_HOLES) return (SET_ERROR(ENOTSUP)); if (top_level && alloctype == VDEV_ALLOC_ADD) { char *bias; /* * If creating a top-level vdev, check for allocation * classes input. */ if (nvlist_lookup_string(nv, ZPOOL_CONFIG_ALLOCATION_BIAS, &bias) == 0) { alloc_bias = vdev_derive_alloc_bias(bias); /* spa_vdev_add() expects feature to be enabled */ if (spa->spa_load_state != SPA_LOAD_CREATE && !spa_feature_is_enabled(spa, SPA_FEATURE_ALLOCATION_CLASSES)) { return (SET_ERROR(ENOTSUP)); } } /* spa_vdev_add() expects feature to be enabled */ if (ops == &vdev_draid_ops && spa->spa_load_state != SPA_LOAD_CREATE && !spa_feature_is_enabled(spa, SPA_FEATURE_DRAID)) { return (SET_ERROR(ENOTSUP)); } } /* * Initialize the vdev specific data. This is done before calling * vdev_alloc_common() since it may fail and this simplifies the * error reporting and cleanup code paths. */ void *tsd = NULL; if (ops->vdev_op_init != NULL) { rc = ops->vdev_op_init(spa, nv, &tsd); if (rc != 0) { return (rc); } } vd = vdev_alloc_common(spa, id, guid, ops); vd->vdev_tsd = tsd; vd->vdev_islog = islog; if (top_level && alloc_bias != VDEV_BIAS_NONE) vd->vdev_alloc_bias = alloc_bias; if (nvlist_lookup_string(nv, ZPOOL_CONFIG_PATH, &vd->vdev_path) == 0) vd->vdev_path = spa_strdup(vd->vdev_path); /* * ZPOOL_CONFIG_AUX_STATE = "external" means we previously forced a * fault on a vdev and want it to persist across imports (like with * zpool offline -f). */ rc = nvlist_lookup_string(nv, ZPOOL_CONFIG_AUX_STATE, &tmp); if (rc == 0 && tmp != NULL && strcmp(tmp, "external") == 0) { vd->vdev_stat.vs_aux = VDEV_AUX_EXTERNAL; vd->vdev_faulted = 1; vd->vdev_label_aux = VDEV_AUX_EXTERNAL; } if (nvlist_lookup_string(nv, ZPOOL_CONFIG_DEVID, &vd->vdev_devid) == 0) vd->vdev_devid = spa_strdup(vd->vdev_devid); if (nvlist_lookup_string(nv, ZPOOL_CONFIG_PHYS_PATH, &vd->vdev_physpath) == 0) vd->vdev_physpath = spa_strdup(vd->vdev_physpath); if (nvlist_lookup_string(nv, ZPOOL_CONFIG_VDEV_ENC_SYSFS_PATH, &vd->vdev_enc_sysfs_path) == 0) vd->vdev_enc_sysfs_path = spa_strdup(vd->vdev_enc_sysfs_path); if (nvlist_lookup_string(nv, ZPOOL_CONFIG_FRU, &vd->vdev_fru) == 0) vd->vdev_fru = spa_strdup(vd->vdev_fru); /* * Set the whole_disk property. If it's not specified, leave the value * as -1. */ if (nvlist_lookup_uint64(nv, ZPOOL_CONFIG_WHOLE_DISK, &vd->vdev_wholedisk) != 0) vd->vdev_wholedisk = -1ULL; vic = &vd->vdev_indirect_config; ASSERT0(vic->vic_mapping_object); (void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_INDIRECT_OBJECT, &vic->vic_mapping_object); ASSERT0(vic->vic_births_object); (void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_INDIRECT_BIRTHS, &vic->vic_births_object); ASSERT3U(vic->vic_prev_indirect_vdev, ==, UINT64_MAX); (void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_PREV_INDIRECT_VDEV, &vic->vic_prev_indirect_vdev); /* * Look for the 'not present' flag. This will only be set if the device * was not present at the time of import. */ (void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_NOT_PRESENT, &vd->vdev_not_present); /* * Get the alignment requirement. */ (void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_ASHIFT, &vd->vdev_ashift); /* * Retrieve the vdev creation time. */ (void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_CREATE_TXG, &vd->vdev_crtxg); /* * If we're a top-level vdev, try to load the allocation parameters. */ if (top_level && (alloctype == VDEV_ALLOC_LOAD || alloctype == VDEV_ALLOC_SPLIT)) { (void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_METASLAB_ARRAY, &vd->vdev_ms_array); (void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_METASLAB_SHIFT, &vd->vdev_ms_shift); (void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_ASIZE, &vd->vdev_asize); (void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_NONALLOCATING, &vd->vdev_noalloc); (void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_REMOVING, &vd->vdev_removing); (void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_VDEV_TOP_ZAP, &vd->vdev_top_zap); } else { ASSERT0(vd->vdev_top_zap); } if (top_level && alloctype != VDEV_ALLOC_ATTACH) { ASSERT(alloctype == VDEV_ALLOC_LOAD || alloctype == VDEV_ALLOC_ADD || alloctype == VDEV_ALLOC_SPLIT || alloctype == VDEV_ALLOC_ROOTPOOL); /* Note: metaslab_group_create() is now deferred */ } if (vd->vdev_ops->vdev_op_leaf && (alloctype == VDEV_ALLOC_LOAD || alloctype == VDEV_ALLOC_SPLIT)) { (void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_VDEV_LEAF_ZAP, &vd->vdev_leaf_zap); } else { ASSERT0(vd->vdev_leaf_zap); } /* * If we're a leaf vdev, try to load the DTL object and other state. */ if (vd->vdev_ops->vdev_op_leaf && (alloctype == VDEV_ALLOC_LOAD || alloctype == VDEV_ALLOC_L2CACHE || alloctype == VDEV_ALLOC_ROOTPOOL)) { if (alloctype == VDEV_ALLOC_LOAD) { (void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_DTL, &vd->vdev_dtl_object); (void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_UNSPARE, &vd->vdev_unspare); } if (alloctype == VDEV_ALLOC_ROOTPOOL) { uint64_t spare = 0; if (nvlist_lookup_uint64(nv, ZPOOL_CONFIG_IS_SPARE, &spare) == 0 && spare) spa_spare_add(vd); } (void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_OFFLINE, &vd->vdev_offline); (void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_RESILVER_TXG, &vd->vdev_resilver_txg); (void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_REBUILD_TXG, &vd->vdev_rebuild_txg); if (nvlist_exists(nv, ZPOOL_CONFIG_RESILVER_DEFER)) vdev_defer_resilver(vd); /* * In general, when importing a pool we want to ignore the * persistent fault state, as the diagnosis made on another * system may not be valid in the current context. The only * exception is if we forced a vdev to a persistently faulted * state with 'zpool offline -f'. The persistent fault will * remain across imports until cleared. * * Local vdevs will remain in the faulted state. */ if (spa_load_state(spa) == SPA_LOAD_OPEN || spa_load_state(spa) == SPA_LOAD_IMPORT) { (void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_FAULTED, &vd->vdev_faulted); (void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_DEGRADED, &vd->vdev_degraded); (void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_REMOVED, &vd->vdev_removed); if (vd->vdev_faulted || vd->vdev_degraded) { char *aux; vd->vdev_label_aux = VDEV_AUX_ERR_EXCEEDED; if (nvlist_lookup_string(nv, ZPOOL_CONFIG_AUX_STATE, &aux) == 0 && strcmp(aux, "external") == 0) vd->vdev_label_aux = VDEV_AUX_EXTERNAL; else vd->vdev_faulted = 0ULL; } } } /* * Add ourselves to the parent's list of children. */ vdev_add_child(parent, vd); *vdp = vd; return (0); } void vdev_free(vdev_t *vd) { spa_t *spa = vd->vdev_spa; ASSERT3P(vd->vdev_initialize_thread, ==, NULL); ASSERT3P(vd->vdev_trim_thread, ==, NULL); ASSERT3P(vd->vdev_autotrim_thread, ==, NULL); ASSERT3P(vd->vdev_rebuild_thread, ==, NULL); /* * Scan queues are normally destroyed at the end of a scan. If the * queue exists here, that implies the vdev is being removed while * the scan is still running. */ if (vd->vdev_scan_io_queue != NULL) { mutex_enter(&vd->vdev_scan_io_queue_lock); dsl_scan_io_queue_destroy(vd->vdev_scan_io_queue); vd->vdev_scan_io_queue = NULL; mutex_exit(&vd->vdev_scan_io_queue_lock); } /* * vdev_free() implies closing the vdev first. This is simpler than * trying to ensure complicated semantics for all callers. */ vdev_close(vd); ASSERT(!list_link_active(&vd->vdev_config_dirty_node)); ASSERT(!list_link_active(&vd->vdev_state_dirty_node)); /* * Free all children. */ for (int c = 0; c < vd->vdev_children; c++) vdev_free(vd->vdev_child[c]); ASSERT(vd->vdev_child == NULL); ASSERT(vd->vdev_guid_sum == vd->vdev_guid); if (vd->vdev_ops->vdev_op_fini != NULL) vd->vdev_ops->vdev_op_fini(vd); /* * Discard allocation state. */ if (vd->vdev_mg != NULL) { vdev_metaslab_fini(vd); metaslab_group_destroy(vd->vdev_mg); vd->vdev_mg = NULL; } if (vd->vdev_log_mg != NULL) { ASSERT0(vd->vdev_ms_count); metaslab_group_destroy(vd->vdev_log_mg); vd->vdev_log_mg = NULL; } ASSERT0(vd->vdev_stat.vs_space); ASSERT0(vd->vdev_stat.vs_dspace); ASSERT0(vd->vdev_stat.vs_alloc); /* * Remove this vdev from its parent's child list. */ vdev_remove_child(vd->vdev_parent, vd); ASSERT(vd->vdev_parent == NULL); ASSERT(!list_link_active(&vd->vdev_leaf_node)); /* * Clean up vdev structure. */ vdev_queue_fini(vd); vdev_cache_fini(vd); if (vd->vdev_path) spa_strfree(vd->vdev_path); if (vd->vdev_devid) spa_strfree(vd->vdev_devid); if (vd->vdev_physpath) spa_strfree(vd->vdev_physpath); if (vd->vdev_enc_sysfs_path) spa_strfree(vd->vdev_enc_sysfs_path); if (vd->vdev_fru) spa_strfree(vd->vdev_fru); if (vd->vdev_isspare) spa_spare_remove(vd); if (vd->vdev_isl2cache) spa_l2cache_remove(vd); txg_list_destroy(&vd->vdev_ms_list); txg_list_destroy(&vd->vdev_dtl_list); mutex_enter(&vd->vdev_dtl_lock); space_map_close(vd->vdev_dtl_sm); for (int t = 0; t < DTL_TYPES; t++) { range_tree_vacate(vd->vdev_dtl[t], NULL, NULL); range_tree_destroy(vd->vdev_dtl[t]); } mutex_exit(&vd->vdev_dtl_lock); EQUIV(vd->vdev_indirect_births != NULL, vd->vdev_indirect_mapping != NULL); if (vd->vdev_indirect_births != NULL) { vdev_indirect_mapping_close(vd->vdev_indirect_mapping); vdev_indirect_births_close(vd->vdev_indirect_births); } if (vd->vdev_obsolete_sm != NULL) { ASSERT(vd->vdev_removing || vd->vdev_ops == &vdev_indirect_ops); space_map_close(vd->vdev_obsolete_sm); vd->vdev_obsolete_sm = NULL; } range_tree_destroy(vd->vdev_obsolete_segments); rw_destroy(&vd->vdev_indirect_rwlock); mutex_destroy(&vd->vdev_obsolete_lock); mutex_destroy(&vd->vdev_dtl_lock); mutex_destroy(&vd->vdev_stat_lock); mutex_destroy(&vd->vdev_probe_lock); mutex_destroy(&vd->vdev_scan_io_queue_lock); mutex_destroy(&vd->vdev_initialize_lock); mutex_destroy(&vd->vdev_initialize_io_lock); cv_destroy(&vd->vdev_initialize_io_cv); cv_destroy(&vd->vdev_initialize_cv); mutex_destroy(&vd->vdev_trim_lock); mutex_destroy(&vd->vdev_autotrim_lock); mutex_destroy(&vd->vdev_trim_io_lock); cv_destroy(&vd->vdev_trim_cv); cv_destroy(&vd->vdev_autotrim_cv); cv_destroy(&vd->vdev_trim_io_cv); mutex_destroy(&vd->vdev_rebuild_lock); cv_destroy(&vd->vdev_rebuild_cv); zfs_ratelimit_fini(&vd->vdev_delay_rl); zfs_ratelimit_fini(&vd->vdev_deadman_rl); zfs_ratelimit_fini(&vd->vdev_checksum_rl); if (vd == spa->spa_root_vdev) spa->spa_root_vdev = NULL; kmem_free(vd, sizeof (vdev_t)); } /* * Transfer top-level vdev state from svd to tvd. */ static void vdev_top_transfer(vdev_t *svd, vdev_t *tvd) { spa_t *spa = svd->vdev_spa; metaslab_t *msp; vdev_t *vd; int t; ASSERT(tvd == tvd->vdev_top); tvd->vdev_pending_fastwrite = svd->vdev_pending_fastwrite; tvd->vdev_ms_array = svd->vdev_ms_array; tvd->vdev_ms_shift = svd->vdev_ms_shift; tvd->vdev_ms_count = svd->vdev_ms_count; tvd->vdev_top_zap = svd->vdev_top_zap; svd->vdev_ms_array = 0; svd->vdev_ms_shift = 0; svd->vdev_ms_count = 0; svd->vdev_top_zap = 0; if (tvd->vdev_mg) ASSERT3P(tvd->vdev_mg, ==, svd->vdev_mg); if (tvd->vdev_log_mg) ASSERT3P(tvd->vdev_log_mg, ==, svd->vdev_log_mg); tvd->vdev_mg = svd->vdev_mg; tvd->vdev_log_mg = svd->vdev_log_mg; tvd->vdev_ms = svd->vdev_ms; svd->vdev_mg = NULL; svd->vdev_log_mg = NULL; svd->vdev_ms = NULL; if (tvd->vdev_mg != NULL) tvd->vdev_mg->mg_vd = tvd; if (tvd->vdev_log_mg != NULL) tvd->vdev_log_mg->mg_vd = tvd; tvd->vdev_checkpoint_sm = svd->vdev_checkpoint_sm; svd->vdev_checkpoint_sm = NULL; tvd->vdev_alloc_bias = svd->vdev_alloc_bias; svd->vdev_alloc_bias = VDEV_BIAS_NONE; tvd->vdev_stat.vs_alloc = svd->vdev_stat.vs_alloc; tvd->vdev_stat.vs_space = svd->vdev_stat.vs_space; tvd->vdev_stat.vs_dspace = svd->vdev_stat.vs_dspace; svd->vdev_stat.vs_alloc = 0; svd->vdev_stat.vs_space = 0; svd->vdev_stat.vs_dspace = 0; /* * State which may be set on a top-level vdev that's in the * process of being removed. */ ASSERT0(tvd->vdev_indirect_config.vic_births_object); ASSERT0(tvd->vdev_indirect_config.vic_mapping_object); ASSERT3U(tvd->vdev_indirect_config.vic_prev_indirect_vdev, ==, -1ULL); ASSERT3P(tvd->vdev_indirect_mapping, ==, NULL); ASSERT3P(tvd->vdev_indirect_births, ==, NULL); ASSERT3P(tvd->vdev_obsolete_sm, ==, NULL); ASSERT0(tvd->vdev_noalloc); ASSERT0(tvd->vdev_removing); ASSERT0(tvd->vdev_rebuilding); tvd->vdev_noalloc = svd->vdev_noalloc; tvd->vdev_removing = svd->vdev_removing; tvd->vdev_rebuilding = svd->vdev_rebuilding; tvd->vdev_rebuild_config = svd->vdev_rebuild_config; tvd->vdev_indirect_config = svd->vdev_indirect_config; tvd->vdev_indirect_mapping = svd->vdev_indirect_mapping; tvd->vdev_indirect_births = svd->vdev_indirect_births; range_tree_swap(&svd->vdev_obsolete_segments, &tvd->vdev_obsolete_segments); tvd->vdev_obsolete_sm = svd->vdev_obsolete_sm; svd->vdev_indirect_config.vic_mapping_object = 0; svd->vdev_indirect_config.vic_births_object = 0; svd->vdev_indirect_config.vic_prev_indirect_vdev = -1ULL; svd->vdev_indirect_mapping = NULL; svd->vdev_indirect_births = NULL; svd->vdev_obsolete_sm = NULL; svd->vdev_noalloc = 0; svd->vdev_removing = 0; svd->vdev_rebuilding = 0; for (t = 0; t < TXG_SIZE; t++) { while ((msp = txg_list_remove(&svd->vdev_ms_list, t)) != NULL) (void) txg_list_add(&tvd->vdev_ms_list, msp, t); while ((vd = txg_list_remove(&svd->vdev_dtl_list, t)) != NULL) (void) txg_list_add(&tvd->vdev_dtl_list, vd, t); if (txg_list_remove_this(&spa->spa_vdev_txg_list, svd, t)) (void) txg_list_add(&spa->spa_vdev_txg_list, tvd, t); } if (list_link_active(&svd->vdev_config_dirty_node)) { vdev_config_clean(svd); vdev_config_dirty(tvd); } if (list_link_active(&svd->vdev_state_dirty_node)) { vdev_state_clean(svd); vdev_state_dirty(tvd); } tvd->vdev_deflate_ratio = svd->vdev_deflate_ratio; svd->vdev_deflate_ratio = 0; tvd->vdev_islog = svd->vdev_islog; svd->vdev_islog = 0; dsl_scan_io_queue_vdev_xfer(svd, tvd); } static void vdev_top_update(vdev_t *tvd, vdev_t *vd) { if (vd == NULL) return; vd->vdev_top = tvd; for (int c = 0; c < vd->vdev_children; c++) vdev_top_update(tvd, vd->vdev_child[c]); } /* * Add a mirror/replacing vdev above an existing vdev. There is no need to * call .vdev_op_init() since mirror/replacing vdevs do not have private state. */ vdev_t * vdev_add_parent(vdev_t *cvd, vdev_ops_t *ops) { spa_t *spa = cvd->vdev_spa; vdev_t *pvd = cvd->vdev_parent; vdev_t *mvd; ASSERT(spa_config_held(spa, SCL_ALL, RW_WRITER) == SCL_ALL); mvd = vdev_alloc_common(spa, cvd->vdev_id, 0, ops); mvd->vdev_asize = cvd->vdev_asize; mvd->vdev_min_asize = cvd->vdev_min_asize; mvd->vdev_max_asize = cvd->vdev_max_asize; mvd->vdev_psize = cvd->vdev_psize; mvd->vdev_ashift = cvd->vdev_ashift; mvd->vdev_logical_ashift = cvd->vdev_logical_ashift; mvd->vdev_physical_ashift = cvd->vdev_physical_ashift; mvd->vdev_state = cvd->vdev_state; mvd->vdev_crtxg = cvd->vdev_crtxg; vdev_remove_child(pvd, cvd); vdev_add_child(pvd, mvd); cvd->vdev_id = mvd->vdev_children; vdev_add_child(mvd, cvd); vdev_top_update(cvd->vdev_top, cvd->vdev_top); if (mvd == mvd->vdev_top) vdev_top_transfer(cvd, mvd); return (mvd); } /* * Remove a 1-way mirror/replacing vdev from the tree. */ void vdev_remove_parent(vdev_t *cvd) { vdev_t *mvd = cvd->vdev_parent; vdev_t *pvd = mvd->vdev_parent; ASSERT(spa_config_held(cvd->vdev_spa, SCL_ALL, RW_WRITER) == SCL_ALL); ASSERT(mvd->vdev_children == 1); ASSERT(mvd->vdev_ops == &vdev_mirror_ops || mvd->vdev_ops == &vdev_replacing_ops || mvd->vdev_ops == &vdev_spare_ops); cvd->vdev_ashift = mvd->vdev_ashift; cvd->vdev_logical_ashift = mvd->vdev_logical_ashift; cvd->vdev_physical_ashift = mvd->vdev_physical_ashift; vdev_remove_child(mvd, cvd); vdev_remove_child(pvd, mvd); /* * If cvd will replace mvd as a top-level vdev, preserve mvd's guid. * Otherwise, we could have detached an offline device, and when we * go to import the pool we'll think we have two top-level vdevs, * instead of a different version of the same top-level vdev. */ if (mvd->vdev_top == mvd) { uint64_t guid_delta = mvd->vdev_guid - cvd->vdev_guid; cvd->vdev_orig_guid = cvd->vdev_guid; cvd->vdev_guid += guid_delta; cvd->vdev_guid_sum += guid_delta; /* * If pool not set for autoexpand, we need to also preserve * mvd's asize to prevent automatic expansion of cvd. * Otherwise if we are adjusting the mirror by attaching and * detaching children of non-uniform sizes, the mirror could * autoexpand, unexpectedly requiring larger devices to * re-establish the mirror. */ if (!cvd->vdev_spa->spa_autoexpand) cvd->vdev_asize = mvd->vdev_asize; } cvd->vdev_id = mvd->vdev_id; vdev_add_child(pvd, cvd); vdev_top_update(cvd->vdev_top, cvd->vdev_top); if (cvd == cvd->vdev_top) vdev_top_transfer(mvd, cvd); ASSERT(mvd->vdev_children == 0); vdev_free(mvd); } void vdev_metaslab_group_create(vdev_t *vd) { spa_t *spa = vd->vdev_spa; /* * metaslab_group_create was delayed until allocation bias was available */ if (vd->vdev_mg == NULL) { metaslab_class_t *mc; if (vd->vdev_islog && vd->vdev_alloc_bias == VDEV_BIAS_NONE) vd->vdev_alloc_bias = VDEV_BIAS_LOG; ASSERT3U(vd->vdev_islog, ==, (vd->vdev_alloc_bias == VDEV_BIAS_LOG)); switch (vd->vdev_alloc_bias) { case VDEV_BIAS_LOG: mc = spa_log_class(spa); break; case VDEV_BIAS_SPECIAL: mc = spa_special_class(spa); break; case VDEV_BIAS_DEDUP: mc = spa_dedup_class(spa); break; default: mc = spa_normal_class(spa); } vd->vdev_mg = metaslab_group_create(mc, vd, spa->spa_alloc_count); if (!vd->vdev_islog) { vd->vdev_log_mg = metaslab_group_create( spa_embedded_log_class(spa), vd, 1); } /* * The spa ashift min/max only apply for the normal metaslab * class. Class destination is late binding so ashift boundary * setting had to wait until now. */ if (vd->vdev_top == vd && vd->vdev_ashift != 0 && mc == spa_normal_class(spa) && vd->vdev_aux == NULL) { if (vd->vdev_ashift > spa->spa_max_ashift) spa->spa_max_ashift = vd->vdev_ashift; if (vd->vdev_ashift < spa->spa_min_ashift) spa->spa_min_ashift = vd->vdev_ashift; uint64_t min_alloc = vdev_get_min_alloc(vd); if (min_alloc < spa->spa_min_alloc) spa->spa_min_alloc = min_alloc; } } } int vdev_metaslab_init(vdev_t *vd, uint64_t txg) { spa_t *spa = vd->vdev_spa; uint64_t oldc = vd->vdev_ms_count; uint64_t newc = vd->vdev_asize >> vd->vdev_ms_shift; metaslab_t **mspp; int error; boolean_t expanding = (oldc != 0); ASSERT(txg == 0 || spa_config_held(spa, SCL_ALLOC, RW_WRITER)); /* * This vdev is not being allocated from yet or is a hole. */ if (vd->vdev_ms_shift == 0) return (0); ASSERT(!vd->vdev_ishole); ASSERT(oldc <= newc); mspp = vmem_zalloc(newc * sizeof (*mspp), KM_SLEEP); if (expanding) { memcpy(mspp, vd->vdev_ms, oldc * sizeof (*mspp)); vmem_free(vd->vdev_ms, oldc * sizeof (*mspp)); } vd->vdev_ms = mspp; vd->vdev_ms_count = newc; for (uint64_t m = oldc; m < newc; m++) { uint64_t object = 0; /* * vdev_ms_array may be 0 if we are creating the "fake" * metaslabs for an indirect vdev for zdb's leak detection. * See zdb_leak_init(). */ if (txg == 0 && vd->vdev_ms_array != 0) { error = dmu_read(spa->spa_meta_objset, vd->vdev_ms_array, m * sizeof (uint64_t), sizeof (uint64_t), &object, DMU_READ_PREFETCH); if (error != 0) { vdev_dbgmsg(vd, "unable to read the metaslab " "array [error=%d]", error); return (error); } } error = metaslab_init(vd->vdev_mg, m, object, txg, &(vd->vdev_ms[m])); if (error != 0) { vdev_dbgmsg(vd, "metaslab_init failed [error=%d]", error); return (error); } } /* * Find the emptiest metaslab on the vdev and mark it for use for * embedded slog by moving it from the regular to the log metaslab * group. */ if (vd->vdev_mg->mg_class == spa_normal_class(spa) && vd->vdev_ms_count > zfs_embedded_slog_min_ms && avl_is_empty(&vd->vdev_log_mg->mg_metaslab_tree)) { uint64_t slog_msid = 0; uint64_t smallest = UINT64_MAX; /* * Note, we only search the new metaslabs, because the old * (pre-existing) ones may be active (e.g. have non-empty * range_tree's), and we don't move them to the new * metaslab_t. */ for (uint64_t m = oldc; m < newc; m++) { uint64_t alloc = space_map_allocated(vd->vdev_ms[m]->ms_sm); if (alloc < smallest) { slog_msid = m; smallest = alloc; } } metaslab_t *slog_ms = vd->vdev_ms[slog_msid]; /* * The metaslab was marked as dirty at the end of * metaslab_init(). Remove it from the dirty list so that we * can uninitialize and reinitialize it to the new class. */ if (txg != 0) { (void) txg_list_remove_this(&vd->vdev_ms_list, slog_ms, txg); } uint64_t sm_obj = space_map_object(slog_ms->ms_sm); metaslab_fini(slog_ms); VERIFY0(metaslab_init(vd->vdev_log_mg, slog_msid, sm_obj, txg, &vd->vdev_ms[slog_msid])); } if (txg == 0) spa_config_enter(spa, SCL_ALLOC, FTAG, RW_WRITER); /* * If the vdev is marked as non-allocating then don't * activate the metaslabs since we want to ensure that * no allocations are performed on this device. */ if (vd->vdev_noalloc) { /* track non-allocating vdev space */ spa->spa_nonallocating_dspace += spa_deflate(spa) ? vd->vdev_stat.vs_dspace : vd->vdev_stat.vs_space; } else if (!expanding) { metaslab_group_activate(vd->vdev_mg); if (vd->vdev_log_mg != NULL) metaslab_group_activate(vd->vdev_log_mg); } if (txg == 0) spa_config_exit(spa, SCL_ALLOC, FTAG); return (0); } void vdev_metaslab_fini(vdev_t *vd) { if (vd->vdev_checkpoint_sm != NULL) { ASSERT(spa_feature_is_active(vd->vdev_spa, SPA_FEATURE_POOL_CHECKPOINT)); space_map_close(vd->vdev_checkpoint_sm); /* * Even though we close the space map, we need to set its * pointer to NULL. The reason is that vdev_metaslab_fini() * may be called multiple times for certain operations * (i.e. when destroying a pool) so we need to ensure that * this clause never executes twice. This logic is similar * to the one used for the vdev_ms clause below. */ vd->vdev_checkpoint_sm = NULL; } if (vd->vdev_ms != NULL) { metaslab_group_t *mg = vd->vdev_mg; metaslab_group_passivate(mg); if (vd->vdev_log_mg != NULL) { ASSERT(!vd->vdev_islog); metaslab_group_passivate(vd->vdev_log_mg); } uint64_t count = vd->vdev_ms_count; for (uint64_t m = 0; m < count; m++) { metaslab_t *msp = vd->vdev_ms[m]; if (msp != NULL) metaslab_fini(msp); } vmem_free(vd->vdev_ms, count * sizeof (metaslab_t *)); vd->vdev_ms = NULL; vd->vdev_ms_count = 0; for (int i = 0; i < RANGE_TREE_HISTOGRAM_SIZE; i++) { ASSERT0(mg->mg_histogram[i]); if (vd->vdev_log_mg != NULL) ASSERT0(vd->vdev_log_mg->mg_histogram[i]); } } ASSERT0(vd->vdev_ms_count); ASSERT3U(vd->vdev_pending_fastwrite, ==, 0); } typedef struct vdev_probe_stats { boolean_t vps_readable; boolean_t vps_writeable; int vps_flags; } vdev_probe_stats_t; static void vdev_probe_done(zio_t *zio) { spa_t *spa = zio->io_spa; vdev_t *vd = zio->io_vd; vdev_probe_stats_t *vps = zio->io_private; ASSERT(vd->vdev_probe_zio != NULL); if (zio->io_type == ZIO_TYPE_READ) { if (zio->io_error == 0) vps->vps_readable = 1; if (zio->io_error == 0 && spa_writeable(spa)) { zio_nowait(zio_write_phys(vd->vdev_probe_zio, vd, zio->io_offset, zio->io_size, zio->io_abd, ZIO_CHECKSUM_OFF, vdev_probe_done, vps, ZIO_PRIORITY_SYNC_WRITE, vps->vps_flags, B_TRUE)); } else { abd_free(zio->io_abd); } } else if (zio->io_type == ZIO_TYPE_WRITE) { if (zio->io_error == 0) vps->vps_writeable = 1; abd_free(zio->io_abd); } else if (zio->io_type == ZIO_TYPE_NULL) { zio_t *pio; zio_link_t *zl; vd->vdev_cant_read |= !vps->vps_readable; vd->vdev_cant_write |= !vps->vps_writeable; if (vdev_readable(vd) && (vdev_writeable(vd) || !spa_writeable(spa))) { zio->io_error = 0; } else { ASSERT(zio->io_error != 0); vdev_dbgmsg(vd, "failed probe"); (void) zfs_ereport_post(FM_EREPORT_ZFS_PROBE_FAILURE, spa, vd, NULL, NULL, 0); zio->io_error = SET_ERROR(ENXIO); } mutex_enter(&vd->vdev_probe_lock); ASSERT(vd->vdev_probe_zio == zio); vd->vdev_probe_zio = NULL; mutex_exit(&vd->vdev_probe_lock); zl = NULL; while ((pio = zio_walk_parents(zio, &zl)) != NULL) if (!vdev_accessible(vd, pio)) pio->io_error = SET_ERROR(ENXIO); kmem_free(vps, sizeof (*vps)); } } /* * Determine whether this device is accessible. * * Read and write to several known locations: the pad regions of each * vdev label but the first, which we leave alone in case it contains * a VTOC. */ zio_t * vdev_probe(vdev_t *vd, zio_t *zio) { spa_t *spa = vd->vdev_spa; vdev_probe_stats_t *vps = NULL; zio_t *pio; ASSERT(vd->vdev_ops->vdev_op_leaf); /* * Don't probe the probe. */ if (zio && (zio->io_flags & ZIO_FLAG_PROBE)) return (NULL); /* * To prevent 'probe storms' when a device fails, we create * just one probe i/o at a time. All zios that want to probe * this vdev will become parents of the probe io. */ mutex_enter(&vd->vdev_probe_lock); if ((pio = vd->vdev_probe_zio) == NULL) { vps = kmem_zalloc(sizeof (*vps), KM_SLEEP); vps->vps_flags = ZIO_FLAG_CANFAIL | ZIO_FLAG_PROBE | ZIO_FLAG_DONT_CACHE | ZIO_FLAG_DONT_AGGREGATE | ZIO_FLAG_TRYHARD; if (spa_config_held(spa, SCL_ZIO, RW_WRITER)) { /* * vdev_cant_read and vdev_cant_write can only * transition from TRUE to FALSE when we have the * SCL_ZIO lock as writer; otherwise they can only * transition from FALSE to TRUE. This ensures that * any zio looking at these values can assume that * failures persist for the life of the I/O. That's * important because when a device has intermittent * connectivity problems, we want to ensure that * they're ascribed to the device (ENXIO) and not * the zio (EIO). * * Since we hold SCL_ZIO as writer here, clear both * values so the probe can reevaluate from first * principles. */ vps->vps_flags |= ZIO_FLAG_CONFIG_WRITER; vd->vdev_cant_read = B_FALSE; vd->vdev_cant_write = B_FALSE; } vd->vdev_probe_zio = pio = zio_null(NULL, spa, vd, vdev_probe_done, vps, vps->vps_flags | ZIO_FLAG_DONT_PROPAGATE); /* * We can't change the vdev state in this context, so we * kick off an async task to do it on our behalf. */ if (zio != NULL) { vd->vdev_probe_wanted = B_TRUE; spa_async_request(spa, SPA_ASYNC_PROBE); } } if (zio != NULL) zio_add_child(zio, pio); mutex_exit(&vd->vdev_probe_lock); if (vps == NULL) { ASSERT(zio != NULL); return (NULL); } for (int l = 1; l < VDEV_LABELS; l++) { zio_nowait(zio_read_phys(pio, vd, vdev_label_offset(vd->vdev_psize, l, offsetof(vdev_label_t, vl_be)), VDEV_PAD_SIZE, abd_alloc_for_io(VDEV_PAD_SIZE, B_TRUE), ZIO_CHECKSUM_OFF, vdev_probe_done, vps, ZIO_PRIORITY_SYNC_READ, vps->vps_flags, B_TRUE)); } if (zio == NULL) return (pio); zio_nowait(pio); return (NULL); } static void vdev_load_child(void *arg) { vdev_t *vd = arg; vd->vdev_load_error = vdev_load(vd); } static void vdev_open_child(void *arg) { vdev_t *vd = arg; vd->vdev_open_thread = curthread; vd->vdev_open_error = vdev_open(vd); vd->vdev_open_thread = NULL; } static boolean_t vdev_uses_zvols(vdev_t *vd) { #ifdef _KERNEL if (zvol_is_zvol(vd->vdev_path)) return (B_TRUE); #endif for (int c = 0; c < vd->vdev_children; c++) if (vdev_uses_zvols(vd->vdev_child[c])) return (B_TRUE); return (B_FALSE); } /* * Returns B_TRUE if the passed child should be opened. */ static boolean_t vdev_default_open_children_func(vdev_t *vd) { (void) vd; return (B_TRUE); } /* * Open the requested child vdevs. If any of the leaf vdevs are using * a ZFS volume then do the opens in a single thread. This avoids a * deadlock when the current thread is holding the spa_namespace_lock. */ static void vdev_open_children_impl(vdev_t *vd, vdev_open_children_func_t *open_func) { int children = vd->vdev_children; taskq_t *tq = taskq_create("vdev_open", children, minclsyspri, children, children, TASKQ_PREPOPULATE); vd->vdev_nonrot = B_TRUE; for (int c = 0; c < children; c++) { vdev_t *cvd = vd->vdev_child[c]; if (open_func(cvd) == B_FALSE) continue; if (tq == NULL || vdev_uses_zvols(vd)) { cvd->vdev_open_error = vdev_open(cvd); } else { VERIFY(taskq_dispatch(tq, vdev_open_child, cvd, TQ_SLEEP) != TASKQID_INVALID); } vd->vdev_nonrot &= cvd->vdev_nonrot; } if (tq != NULL) { taskq_wait(tq); taskq_destroy(tq); } } /* * Open all child vdevs. */ void vdev_open_children(vdev_t *vd) { vdev_open_children_impl(vd, vdev_default_open_children_func); } /* * Conditionally open a subset of child vdevs. */ void vdev_open_children_subset(vdev_t *vd, vdev_open_children_func_t *open_func) { vdev_open_children_impl(vd, open_func); } /* * Compute the raidz-deflation ratio. Note, we hard-code * in 128k (1 << 17) because it is the "typical" blocksize. * Even though SPA_MAXBLOCKSIZE changed, this algorithm can not change, * otherwise it would inconsistently account for existing bp's. */ static void vdev_set_deflate_ratio(vdev_t *vd) { if (vd == vd->vdev_top && !vd->vdev_ishole && vd->vdev_ashift != 0) { vd->vdev_deflate_ratio = (1 << 17) / (vdev_psize_to_asize(vd, 1 << 17) >> SPA_MINBLOCKSHIFT); } } /* * Choose the best of two ashifts, preferring one between logical ashift * (absolute minimum) and administrator defined maximum, otherwise take * the biggest of the two. */ uint64_t vdev_best_ashift(uint64_t logical, uint64_t a, uint64_t b) { if (a > logical && a <= zfs_vdev_max_auto_ashift) { if (b <= logical || b > zfs_vdev_max_auto_ashift) return (a); else return (MAX(a, b)); } else if (b <= logical || b > zfs_vdev_max_auto_ashift) return (MAX(a, b)); return (b); } /* * Maximize performance by inflating the configured ashift for top level * vdevs to be as close to the physical ashift as possible while maintaining * administrator defined limits and ensuring it doesn't go below the * logical ashift. */ static void vdev_ashift_optimize(vdev_t *vd) { ASSERT(vd == vd->vdev_top); if (vd->vdev_ashift < vd->vdev_physical_ashift && vd->vdev_physical_ashift <= zfs_vdev_max_auto_ashift) { vd->vdev_ashift = MIN( MAX(zfs_vdev_max_auto_ashift, vd->vdev_ashift), MAX(zfs_vdev_min_auto_ashift, vd->vdev_physical_ashift)); } else { /* * If the logical and physical ashifts are the same, then * we ensure that the top-level vdev's ashift is not smaller * than our minimum ashift value. For the unusual case * where logical ashift > physical ashift, we can't cap * the calculated ashift based on max ashift as that * would cause failures. * We still check if we need to increase it to match * the min ashift. */ vd->vdev_ashift = MAX(zfs_vdev_min_auto_ashift, vd->vdev_ashift); } } /* * Prepare a virtual device for access. */ int vdev_open(vdev_t *vd) { spa_t *spa = vd->vdev_spa; int error; uint64_t osize = 0; uint64_t max_osize = 0; uint64_t asize, max_asize, psize; uint64_t logical_ashift = 0; uint64_t physical_ashift = 0; ASSERT(vd->vdev_open_thread == curthread || spa_config_held(spa, SCL_STATE_ALL, RW_WRITER) == SCL_STATE_ALL); ASSERT(vd->vdev_state == VDEV_STATE_CLOSED || vd->vdev_state == VDEV_STATE_CANT_OPEN || vd->vdev_state == VDEV_STATE_OFFLINE); vd->vdev_stat.vs_aux = VDEV_AUX_NONE; vd->vdev_cant_read = B_FALSE; vd->vdev_cant_write = B_FALSE; vd->vdev_min_asize = vdev_get_min_asize(vd); /* * If this vdev is not removed, check its fault status. If it's * faulted, bail out of the open. */ if (!vd->vdev_removed && vd->vdev_faulted) { ASSERT(vd->vdev_children == 0); ASSERT(vd->vdev_label_aux == VDEV_AUX_ERR_EXCEEDED || vd->vdev_label_aux == VDEV_AUX_EXTERNAL); vdev_set_state(vd, B_TRUE, VDEV_STATE_FAULTED, vd->vdev_label_aux); return (SET_ERROR(ENXIO)); } else if (vd->vdev_offline) { ASSERT(vd->vdev_children == 0); vdev_set_state(vd, B_TRUE, VDEV_STATE_OFFLINE, VDEV_AUX_NONE); return (SET_ERROR(ENXIO)); } error = vd->vdev_ops->vdev_op_open(vd, &osize, &max_osize, &logical_ashift, &physical_ashift); /* Keep the device in removed state if unplugged */ if (error == ENOENT && vd->vdev_removed) { vdev_set_state(vd, B_TRUE, VDEV_STATE_REMOVED, VDEV_AUX_NONE); return (error); } /* * Physical volume size should never be larger than its max size, unless * the disk has shrunk while we were reading it or the device is buggy * or damaged: either way it's not safe for use, bail out of the open. */ if (osize > max_osize) { vdev_set_state(vd, B_TRUE, VDEV_STATE_CANT_OPEN, VDEV_AUX_OPEN_FAILED); return (SET_ERROR(ENXIO)); } /* * Reset the vdev_reopening flag so that we actually close * the vdev on error. */ vd->vdev_reopening = B_FALSE; if (zio_injection_enabled && error == 0) error = zio_handle_device_injection(vd, NULL, SET_ERROR(ENXIO)); if (error) { if (vd->vdev_removed && vd->vdev_stat.vs_aux != VDEV_AUX_OPEN_FAILED) vd->vdev_removed = B_FALSE; if (vd->vdev_stat.vs_aux == VDEV_AUX_CHILDREN_OFFLINE) { vdev_set_state(vd, B_TRUE, VDEV_STATE_OFFLINE, vd->vdev_stat.vs_aux); } else { vdev_set_state(vd, B_TRUE, VDEV_STATE_CANT_OPEN, vd->vdev_stat.vs_aux); } return (error); } vd->vdev_removed = B_FALSE; /* * Recheck the faulted flag now that we have confirmed that * the vdev is accessible. If we're faulted, bail. */ if (vd->vdev_faulted) { ASSERT(vd->vdev_children == 0); ASSERT(vd->vdev_label_aux == VDEV_AUX_ERR_EXCEEDED || vd->vdev_label_aux == VDEV_AUX_EXTERNAL); vdev_set_state(vd, B_TRUE, VDEV_STATE_FAULTED, vd->vdev_label_aux); return (SET_ERROR(ENXIO)); } if (vd->vdev_degraded) { ASSERT(vd->vdev_children == 0); vdev_set_state(vd, B_TRUE, VDEV_STATE_DEGRADED, VDEV_AUX_ERR_EXCEEDED); } else { vdev_set_state(vd, B_TRUE, VDEV_STATE_HEALTHY, 0); } /* * For hole or missing vdevs we just return success. */ if (vd->vdev_ishole || vd->vdev_ops == &vdev_missing_ops) return (0); for (int c = 0; c < vd->vdev_children; c++) { if (vd->vdev_child[c]->vdev_state != VDEV_STATE_HEALTHY) { vdev_set_state(vd, B_TRUE, VDEV_STATE_DEGRADED, VDEV_AUX_NONE); break; } } osize = P2ALIGN(osize, (uint64_t)sizeof (vdev_label_t)); max_osize = P2ALIGN(max_osize, (uint64_t)sizeof (vdev_label_t)); if (vd->vdev_children == 0) { if (osize < SPA_MINDEVSIZE) { vdev_set_state(vd, B_TRUE, VDEV_STATE_CANT_OPEN, VDEV_AUX_TOO_SMALL); return (SET_ERROR(EOVERFLOW)); } psize = osize; asize = osize - (VDEV_LABEL_START_SIZE + VDEV_LABEL_END_SIZE); max_asize = max_osize - (VDEV_LABEL_START_SIZE + VDEV_LABEL_END_SIZE); } else { if (vd->vdev_parent != NULL && osize < SPA_MINDEVSIZE - (VDEV_LABEL_START_SIZE + VDEV_LABEL_END_SIZE)) { vdev_set_state(vd, B_TRUE, VDEV_STATE_CANT_OPEN, VDEV_AUX_TOO_SMALL); return (SET_ERROR(EOVERFLOW)); } psize = 0; asize = osize; max_asize = max_osize; } /* * If the vdev was expanded, record this so that we can re-create the * uberblock rings in labels {2,3}, during the next sync. */ if ((psize > vd->vdev_psize) && (vd->vdev_psize != 0)) vd->vdev_copy_uberblocks = B_TRUE; vd->vdev_psize = psize; /* * Make sure the allocatable size hasn't shrunk too much. */ if (asize < vd->vdev_min_asize) { vdev_set_state(vd, B_TRUE, VDEV_STATE_CANT_OPEN, VDEV_AUX_BAD_LABEL); return (SET_ERROR(EINVAL)); } /* * We can always set the logical/physical ashift members since * their values are only used to calculate the vdev_ashift when * the device is first added to the config. These values should * not be used for anything else since they may change whenever * the device is reopened and we don't store them in the label. */ vd->vdev_physical_ashift = MAX(physical_ashift, vd->vdev_physical_ashift); vd->vdev_logical_ashift = MAX(logical_ashift, vd->vdev_logical_ashift); if (vd->vdev_asize == 0) { /* * This is the first-ever open, so use the computed values. * For compatibility, a different ashift can be requested. */ vd->vdev_asize = asize; vd->vdev_max_asize = max_asize; /* * If the vdev_ashift was not overridden at creation time, * then set it the logical ashift and optimize the ashift. */ if (vd->vdev_ashift == 0) { vd->vdev_ashift = vd->vdev_logical_ashift; if (vd->vdev_logical_ashift > ASHIFT_MAX) { vdev_set_state(vd, B_TRUE, VDEV_STATE_CANT_OPEN, VDEV_AUX_ASHIFT_TOO_BIG); return (SET_ERROR(EDOM)); } if (vd->vdev_top == vd) { vdev_ashift_optimize(vd); } } if (vd->vdev_ashift != 0 && (vd->vdev_ashift < ASHIFT_MIN || vd->vdev_ashift > ASHIFT_MAX)) { vdev_set_state(vd, B_TRUE, VDEV_STATE_CANT_OPEN, VDEV_AUX_BAD_ASHIFT); return (SET_ERROR(EDOM)); } } else { /* * Make sure the alignment required hasn't increased. */ if (vd->vdev_ashift > vd->vdev_top->vdev_ashift && vd->vdev_ops->vdev_op_leaf) { (void) zfs_ereport_post( FM_EREPORT_ZFS_DEVICE_BAD_ASHIFT, spa, vd, NULL, NULL, 0); vdev_set_state(vd, B_TRUE, VDEV_STATE_CANT_OPEN, VDEV_AUX_BAD_LABEL); return (SET_ERROR(EDOM)); } vd->vdev_max_asize = max_asize; } /* * If all children are healthy we update asize if either: * The asize has increased, due to a device expansion caused by dynamic * LUN growth or vdev replacement, and automatic expansion is enabled; * making the additional space available. * * The asize has decreased, due to a device shrink usually caused by a * vdev replace with a smaller device. This ensures that calculations * based of max_asize and asize e.g. esize are always valid. It's safe * to do this as we've already validated that asize is greater than * vdev_min_asize. */ if (vd->vdev_state == VDEV_STATE_HEALTHY && ((asize > vd->vdev_asize && (vd->vdev_expanding || spa->spa_autoexpand)) || (asize < vd->vdev_asize))) vd->vdev_asize = asize; vdev_set_min_asize(vd); /* * Ensure we can issue some IO before declaring the * vdev open for business. */ if (vd->vdev_ops->vdev_op_leaf && (error = zio_wait(vdev_probe(vd, NULL))) != 0) { vdev_set_state(vd, B_TRUE, VDEV_STATE_FAULTED, VDEV_AUX_ERR_EXCEEDED); return (error); } /* * Track the minimum allocation size. */ if (vd->vdev_top == vd && vd->vdev_ashift != 0 && vd->vdev_islog == 0 && vd->vdev_aux == NULL) { uint64_t min_alloc = vdev_get_min_alloc(vd); if (min_alloc < spa->spa_min_alloc) spa->spa_min_alloc = min_alloc; } /* * If this is a leaf vdev, assess whether a resilver is needed. * But don't do this if we are doing a reopen for a scrub, since * this would just restart the scrub we are already doing. */ if (vd->vdev_ops->vdev_op_leaf && !spa->spa_scrub_reopen) dsl_scan_assess_vdev(spa->spa_dsl_pool, vd); return (0); } static void vdev_validate_child(void *arg) { vdev_t *vd = arg; vd->vdev_validate_thread = curthread; vd->vdev_validate_error = vdev_validate(vd); vd->vdev_validate_thread = NULL; } /* * Called once the vdevs are all opened, this routine validates the label * contents. This needs to be done before vdev_load() so that we don't * inadvertently do repair I/Os to the wrong device. * * This function will only return failure if one of the vdevs indicates that it * has since been destroyed or exported. This is only possible if * /etc/zfs/zpool.cache was readonly at the time. Otherwise, the vdev state * will be updated but the function will return 0. */ int vdev_validate(vdev_t *vd) { spa_t *spa = vd->vdev_spa; taskq_t *tq = NULL; nvlist_t *label; uint64_t guid = 0, aux_guid = 0, top_guid; uint64_t state; nvlist_t *nvl; uint64_t txg; int children = vd->vdev_children; if (vdev_validate_skip) return (0); if (children > 0) { tq = taskq_create("vdev_validate", children, minclsyspri, children, children, TASKQ_PREPOPULATE); } for (uint64_t c = 0; c < children; c++) { vdev_t *cvd = vd->vdev_child[c]; if (tq == NULL || vdev_uses_zvols(cvd)) { vdev_validate_child(cvd); } else { VERIFY(taskq_dispatch(tq, vdev_validate_child, cvd, TQ_SLEEP) != TASKQID_INVALID); } } if (tq != NULL) { taskq_wait(tq); taskq_destroy(tq); } for (int c = 0; c < children; c++) { int error = vd->vdev_child[c]->vdev_validate_error; if (error != 0) return (SET_ERROR(EBADF)); } /* * If the device has already failed, or was marked offline, don't do * any further validation. Otherwise, label I/O will fail and we will * overwrite the previous state. */ if (!vd->vdev_ops->vdev_op_leaf || !vdev_readable(vd)) return (0); /* * If we are performing an extreme rewind, we allow for a label that * was modified at a point after the current txg. * If config lock is not held do not check for the txg. spa_sync could * be updating the vdev's label before updating spa_last_synced_txg. */ if (spa->spa_extreme_rewind || spa_last_synced_txg(spa) == 0 || spa_config_held(spa, SCL_CONFIG, RW_WRITER) != SCL_CONFIG) txg = UINT64_MAX; else txg = spa_last_synced_txg(spa); if ((label = vdev_label_read_config(vd, txg)) == NULL) { vdev_set_state(vd, B_FALSE, VDEV_STATE_CANT_OPEN, VDEV_AUX_BAD_LABEL); vdev_dbgmsg(vd, "vdev_validate: failed reading config for " "txg %llu", (u_longlong_t)txg); return (0); } /* * Determine if this vdev has been split off into another * pool. If so, then refuse to open it. */ if (nvlist_lookup_uint64(label, ZPOOL_CONFIG_SPLIT_GUID, &aux_guid) == 0 && aux_guid == spa_guid(spa)) { vdev_set_state(vd, B_FALSE, VDEV_STATE_CANT_OPEN, VDEV_AUX_SPLIT_POOL); nvlist_free(label); vdev_dbgmsg(vd, "vdev_validate: vdev split into other pool"); return (0); } if (nvlist_lookup_uint64(label, ZPOOL_CONFIG_POOL_GUID, &guid) != 0) { vdev_set_state(vd, B_FALSE, VDEV_STATE_CANT_OPEN, VDEV_AUX_CORRUPT_DATA); nvlist_free(label); vdev_dbgmsg(vd, "vdev_validate: '%s' missing from label", ZPOOL_CONFIG_POOL_GUID); return (0); } /* * If config is not trusted then ignore the spa guid check. This is * necessary because if the machine crashed during a re-guid the new * guid might have been written to all of the vdev labels, but not the * cached config. The check will be performed again once we have the * trusted config from the MOS. */ if (spa->spa_trust_config && guid != spa_guid(spa)) { vdev_set_state(vd, B_FALSE, VDEV_STATE_CANT_OPEN, VDEV_AUX_CORRUPT_DATA); nvlist_free(label); vdev_dbgmsg(vd, "vdev_validate: vdev label pool_guid doesn't " "match config (%llu != %llu)", (u_longlong_t)guid, (u_longlong_t)spa_guid(spa)); return (0); } if (nvlist_lookup_nvlist(label, ZPOOL_CONFIG_VDEV_TREE, &nvl) != 0 || nvlist_lookup_uint64(nvl, ZPOOL_CONFIG_ORIG_GUID, &aux_guid) != 0) aux_guid = 0; if (nvlist_lookup_uint64(label, ZPOOL_CONFIG_GUID, &guid) != 0) { vdev_set_state(vd, B_FALSE, VDEV_STATE_CANT_OPEN, VDEV_AUX_CORRUPT_DATA); nvlist_free(label); vdev_dbgmsg(vd, "vdev_validate: '%s' missing from label", ZPOOL_CONFIG_GUID); return (0); } if (nvlist_lookup_uint64(label, ZPOOL_CONFIG_TOP_GUID, &top_guid) != 0) { vdev_set_state(vd, B_FALSE, VDEV_STATE_CANT_OPEN, VDEV_AUX_CORRUPT_DATA); nvlist_free(label); vdev_dbgmsg(vd, "vdev_validate: '%s' missing from label", ZPOOL_CONFIG_TOP_GUID); return (0); } /* * If this vdev just became a top-level vdev because its sibling was * detached, it will have adopted the parent's vdev guid -- but the * label may or may not be on disk yet. Fortunately, either version * of the label will have the same top guid, so if we're a top-level * vdev, we can safely compare to that instead. * However, if the config comes from a cachefile that failed to update * after the detach, a top-level vdev will appear as a non top-level * vdev in the config. Also relax the constraints if we perform an * extreme rewind. * * If we split this vdev off instead, then we also check the * original pool's guid. We don't want to consider the vdev * corrupt if it is partway through a split operation. */ if (vd->vdev_guid != guid && vd->vdev_guid != aux_guid) { boolean_t mismatch = B_FALSE; if (spa->spa_trust_config && !spa->spa_extreme_rewind) { if (vd != vd->vdev_top || vd->vdev_guid != top_guid) mismatch = B_TRUE; } else { if (vd->vdev_guid != top_guid && vd->vdev_top->vdev_guid != guid) mismatch = B_TRUE; } if (mismatch) { vdev_set_state(vd, B_FALSE, VDEV_STATE_CANT_OPEN, VDEV_AUX_CORRUPT_DATA); nvlist_free(label); vdev_dbgmsg(vd, "vdev_validate: config guid " "doesn't match label guid"); vdev_dbgmsg(vd, "CONFIG: guid %llu, top_guid %llu", (u_longlong_t)vd->vdev_guid, (u_longlong_t)vd->vdev_top->vdev_guid); vdev_dbgmsg(vd, "LABEL: guid %llu, top_guid %llu, " "aux_guid %llu", (u_longlong_t)guid, (u_longlong_t)top_guid, (u_longlong_t)aux_guid); return (0); } } if (nvlist_lookup_uint64(label, ZPOOL_CONFIG_POOL_STATE, &state) != 0) { vdev_set_state(vd, B_FALSE, VDEV_STATE_CANT_OPEN, VDEV_AUX_CORRUPT_DATA); nvlist_free(label); vdev_dbgmsg(vd, "vdev_validate: '%s' missing from label", ZPOOL_CONFIG_POOL_STATE); return (0); } nvlist_free(label); /* * If this is a verbatim import, no need to check the * state of the pool. */ if (!(spa->spa_import_flags & ZFS_IMPORT_VERBATIM) && spa_load_state(spa) == SPA_LOAD_OPEN && state != POOL_STATE_ACTIVE) { vdev_dbgmsg(vd, "vdev_validate: invalid pool state (%llu) " "for spa %s", (u_longlong_t)state, spa->spa_name); return (SET_ERROR(EBADF)); } /* * If we were able to open and validate a vdev that was * previously marked permanently unavailable, clear that state * now. */ if (vd->vdev_not_present) vd->vdev_not_present = 0; return (0); } static void vdev_copy_path_impl(vdev_t *svd, vdev_t *dvd) { char *old, *new; if (svd->vdev_path != NULL && dvd->vdev_path != NULL) { if (strcmp(svd->vdev_path, dvd->vdev_path) != 0) { zfs_dbgmsg("vdev_copy_path: vdev %llu: path changed " "from '%s' to '%s'", (u_longlong_t)dvd->vdev_guid, dvd->vdev_path, svd->vdev_path); spa_strfree(dvd->vdev_path); dvd->vdev_path = spa_strdup(svd->vdev_path); } } else if (svd->vdev_path != NULL) { dvd->vdev_path = spa_strdup(svd->vdev_path); zfs_dbgmsg("vdev_copy_path: vdev %llu: path set to '%s'", (u_longlong_t)dvd->vdev_guid, dvd->vdev_path); } /* * Our enclosure sysfs path may have changed between imports */ old = dvd->vdev_enc_sysfs_path; new = svd->vdev_enc_sysfs_path; if ((old != NULL && new == NULL) || (old == NULL && new != NULL) || ((old != NULL && new != NULL) && strcmp(new, old) != 0)) { zfs_dbgmsg("vdev_copy_path: vdev %llu: vdev_enc_sysfs_path " "changed from '%s' to '%s'", (u_longlong_t)dvd->vdev_guid, old, new); if (dvd->vdev_enc_sysfs_path) spa_strfree(dvd->vdev_enc_sysfs_path); if (svd->vdev_enc_sysfs_path) { dvd->vdev_enc_sysfs_path = spa_strdup( svd->vdev_enc_sysfs_path); } else { dvd->vdev_enc_sysfs_path = NULL; } } } /* * Recursively copy vdev paths from one vdev to another. Source and destination * vdev trees must have same geometry otherwise return error. Intended to copy * paths from userland config into MOS config. */ int vdev_copy_path_strict(vdev_t *svd, vdev_t *dvd) { if ((svd->vdev_ops == &vdev_missing_ops) || (svd->vdev_ishole && dvd->vdev_ishole) || (dvd->vdev_ops == &vdev_indirect_ops)) return (0); if (svd->vdev_ops != dvd->vdev_ops) { vdev_dbgmsg(svd, "vdev_copy_path: vdev type mismatch: %s != %s", svd->vdev_ops->vdev_op_type, dvd->vdev_ops->vdev_op_type); return (SET_ERROR(EINVAL)); } if (svd->vdev_guid != dvd->vdev_guid) { vdev_dbgmsg(svd, "vdev_copy_path: guids mismatch (%llu != " "%llu)", (u_longlong_t)svd->vdev_guid, (u_longlong_t)dvd->vdev_guid); return (SET_ERROR(EINVAL)); } if (svd->vdev_children != dvd->vdev_children) { vdev_dbgmsg(svd, "vdev_copy_path: children count mismatch: " "%llu != %llu", (u_longlong_t)svd->vdev_children, (u_longlong_t)dvd->vdev_children); return (SET_ERROR(EINVAL)); } for (uint64_t i = 0; i < svd->vdev_children; i++) { int error = vdev_copy_path_strict(svd->vdev_child[i], dvd->vdev_child[i]); if (error != 0) return (error); } if (svd->vdev_ops->vdev_op_leaf) vdev_copy_path_impl(svd, dvd); return (0); } static void vdev_copy_path_search(vdev_t *stvd, vdev_t *dvd) { ASSERT(stvd->vdev_top == stvd); ASSERT3U(stvd->vdev_id, ==, dvd->vdev_top->vdev_id); for (uint64_t i = 0; i < dvd->vdev_children; i++) { vdev_copy_path_search(stvd, dvd->vdev_child[i]); } if (!dvd->vdev_ops->vdev_op_leaf || !vdev_is_concrete(dvd)) return; /* * The idea here is that while a vdev can shift positions within * a top vdev (when replacing, attaching mirror, etc.) it cannot * step outside of it. */ vdev_t *vd = vdev_lookup_by_guid(stvd, dvd->vdev_guid); if (vd == NULL || vd->vdev_ops != dvd->vdev_ops) return; ASSERT(vd->vdev_ops->vdev_op_leaf); vdev_copy_path_impl(vd, dvd); } /* * Recursively copy vdev paths from one root vdev to another. Source and * destination vdev trees may differ in geometry. For each destination leaf * vdev, search a vdev with the same guid and top vdev id in the source. * Intended to copy paths from userland config into MOS config. */ void vdev_copy_path_relaxed(vdev_t *srvd, vdev_t *drvd) { uint64_t children = MIN(srvd->vdev_children, drvd->vdev_children); ASSERT(srvd->vdev_ops == &vdev_root_ops); ASSERT(drvd->vdev_ops == &vdev_root_ops); for (uint64_t i = 0; i < children; i++) { vdev_copy_path_search(srvd->vdev_child[i], drvd->vdev_child[i]); } } /* * Close a virtual device. */ void vdev_close(vdev_t *vd) { vdev_t *pvd = vd->vdev_parent; spa_t *spa __maybe_unused = vd->vdev_spa; ASSERT(vd != NULL); ASSERT(vd->vdev_open_thread == curthread || spa_config_held(spa, SCL_STATE_ALL, RW_WRITER) == SCL_STATE_ALL); /* * If our parent is reopening, then we are as well, unless we are * going offline. */ if (pvd != NULL && pvd->vdev_reopening) vd->vdev_reopening = (pvd->vdev_reopening && !vd->vdev_offline); vd->vdev_ops->vdev_op_close(vd); vdev_cache_purge(vd); /* * We record the previous state before we close it, so that if we are * doing a reopen(), we don't generate FMA ereports if we notice that * it's still faulted. */ vd->vdev_prevstate = vd->vdev_state; if (vd->vdev_offline) vd->vdev_state = VDEV_STATE_OFFLINE; else vd->vdev_state = VDEV_STATE_CLOSED; vd->vdev_stat.vs_aux = VDEV_AUX_NONE; } void vdev_hold(vdev_t *vd) { spa_t *spa = vd->vdev_spa; ASSERT(spa_is_root(spa)); if (spa->spa_state == POOL_STATE_UNINITIALIZED) return; for (int c = 0; c < vd->vdev_children; c++) vdev_hold(vd->vdev_child[c]); if (vd->vdev_ops->vdev_op_leaf && vd->vdev_ops->vdev_op_hold != NULL) vd->vdev_ops->vdev_op_hold(vd); } void vdev_rele(vdev_t *vd) { ASSERT(spa_is_root(vd->vdev_spa)); for (int c = 0; c < vd->vdev_children; c++) vdev_rele(vd->vdev_child[c]); if (vd->vdev_ops->vdev_op_leaf && vd->vdev_ops->vdev_op_rele != NULL) vd->vdev_ops->vdev_op_rele(vd); } /* * Reopen all interior vdevs and any unopened leaves. We don't actually * reopen leaf vdevs which had previously been opened as they might deadlock * on the spa_config_lock. Instead we only obtain the leaf's physical size. * If the leaf has never been opened then open it, as usual. */ void vdev_reopen(vdev_t *vd) { spa_t *spa = vd->vdev_spa; ASSERT(spa_config_held(spa, SCL_STATE_ALL, RW_WRITER) == SCL_STATE_ALL); /* set the reopening flag unless we're taking the vdev offline */ vd->vdev_reopening = !vd->vdev_offline; vdev_close(vd); (void) vdev_open(vd); /* * Call vdev_validate() here to make sure we have the same device. * Otherwise, a device with an invalid label could be successfully * opened in response to vdev_reopen(). */ if (vd->vdev_aux) { (void) vdev_validate_aux(vd); if (vdev_readable(vd) && vdev_writeable(vd) && vd->vdev_aux == &spa->spa_l2cache) { /* * In case the vdev is present we should evict all ARC * buffers and pointers to log blocks and reclaim their * space before restoring its contents to L2ARC. */ if (l2arc_vdev_present(vd)) { l2arc_rebuild_vdev(vd, B_TRUE); } else { l2arc_add_vdev(spa, vd); } spa_async_request(spa, SPA_ASYNC_L2CACHE_REBUILD); spa_async_request(spa, SPA_ASYNC_L2CACHE_TRIM); } } else { (void) vdev_validate(vd); } /* * Reassess parent vdev's health. */ vdev_propagate_state(vd); } int vdev_create(vdev_t *vd, uint64_t txg, boolean_t isreplacing) { int error; /* * Normally, partial opens (e.g. of a mirror) are allowed. * For a create, however, we want to fail the request if * there are any components we can't open. */ error = vdev_open(vd); if (error || vd->vdev_state != VDEV_STATE_HEALTHY) { vdev_close(vd); return (error ? error : SET_ERROR(ENXIO)); } /* * Recursively load DTLs and initialize all labels. */ if ((error = vdev_dtl_load(vd)) != 0 || (error = vdev_label_init(vd, txg, isreplacing ? VDEV_LABEL_REPLACE : VDEV_LABEL_CREATE)) != 0) { vdev_close(vd); return (error); } return (0); } void vdev_metaslab_set_size(vdev_t *vd) { uint64_t asize = vd->vdev_asize; uint64_t ms_count = asize >> zfs_vdev_default_ms_shift; uint64_t ms_shift; /* * There are two dimensions to the metaslab sizing calculation: * the size of the metaslab and the count of metaslabs per vdev. * * The default values used below are a good balance between memory * usage (larger metaslab size means more memory needed for loaded * metaslabs; more metaslabs means more memory needed for the * metaslab_t structs), metaslab load time (larger metaslabs take * longer to load), and metaslab sync time (more metaslabs means * more time spent syncing all of them). * * In general, we aim for zfs_vdev_default_ms_count (200) metaslabs. * The range of the dimensions are as follows: * * 2^29 <= ms_size <= 2^34 * 16 <= ms_count <= 131,072 * * On the lower end of vdev sizes, we aim for metaslabs sizes of * at least 512MB (2^29) to minimize fragmentation effects when * testing with smaller devices. However, the count constraint * of at least 16 metaslabs will override this minimum size goal. * * On the upper end of vdev sizes, we aim for a maximum metaslab * size of 16GB. However, we will cap the total count to 2^17 * metaslabs to keep our memory footprint in check and let the * metaslab size grow from there if that limit is hit. * * The net effect of applying above constrains is summarized below. * * vdev size metaslab count * --------------|----------------- * < 8GB ~16 * 8GB - 100GB one per 512MB * 100GB - 3TB ~200 * 3TB - 2PB one per 16GB * > 2PB ~131,072 * -------------------------------- * * Finally, note that all of the above calculate the initial * number of metaslabs. Expanding a top-level vdev will result * in additional metaslabs being allocated making it possible * to exceed the zfs_vdev_ms_count_limit. */ if (ms_count < zfs_vdev_min_ms_count) ms_shift = highbit64(asize / zfs_vdev_min_ms_count); else if (ms_count > zfs_vdev_default_ms_count) ms_shift = highbit64(asize / zfs_vdev_default_ms_count); else ms_shift = zfs_vdev_default_ms_shift; if (ms_shift < SPA_MAXBLOCKSHIFT) { ms_shift = SPA_MAXBLOCKSHIFT; } else if (ms_shift > zfs_vdev_max_ms_shift) { ms_shift = zfs_vdev_max_ms_shift; /* cap the total count to constrain memory footprint */ if ((asize >> ms_shift) > zfs_vdev_ms_count_limit) ms_shift = highbit64(asize / zfs_vdev_ms_count_limit); } vd->vdev_ms_shift = ms_shift; ASSERT3U(vd->vdev_ms_shift, >=, SPA_MAXBLOCKSHIFT); } void vdev_dirty(vdev_t *vd, int flags, void *arg, uint64_t txg) { ASSERT(vd == vd->vdev_top); /* indirect vdevs don't have metaslabs or dtls */ ASSERT(vdev_is_concrete(vd) || flags == 0); ASSERT(ISP2(flags)); ASSERT(spa_writeable(vd->vdev_spa)); if (flags & VDD_METASLAB) (void) txg_list_add(&vd->vdev_ms_list, arg, txg); if (flags & VDD_DTL) (void) txg_list_add(&vd->vdev_dtl_list, arg, txg); (void) txg_list_add(&vd->vdev_spa->spa_vdev_txg_list, vd, txg); } void vdev_dirty_leaves(vdev_t *vd, int flags, uint64_t txg) { for (int c = 0; c < vd->vdev_children; c++) vdev_dirty_leaves(vd->vdev_child[c], flags, txg); if (vd->vdev_ops->vdev_op_leaf) vdev_dirty(vd->vdev_top, flags, vd, txg); } /* * DTLs. * * A vdev's DTL (dirty time log) is the set of transaction groups for which * the vdev has less than perfect replication. There are four kinds of DTL: * * DTL_MISSING: txgs for which the vdev has no valid copies of the data * * DTL_PARTIAL: txgs for which data is available, but not fully replicated * * DTL_SCRUB: the txgs that could not be repaired by the last scrub; upon * scrub completion, DTL_SCRUB replaces DTL_MISSING in the range of * txgs that was scrubbed. * * DTL_OUTAGE: txgs which cannot currently be read, whether due to * persistent errors or just some device being offline. * Unlike the other three, the DTL_OUTAGE map is not generally * maintained; it's only computed when needed, typically to * determine whether a device can be detached. * * For leaf vdevs, DTL_MISSING and DTL_PARTIAL are identical: the device * either has the data or it doesn't. * * For interior vdevs such as mirror and RAID-Z the picture is more complex. * A vdev's DTL_PARTIAL is the union of its children's DTL_PARTIALs, because * if any child is less than fully replicated, then so is its parent. * A vdev's DTL_MISSING is a modified union of its children's DTL_MISSINGs, * comprising only those txgs which appear in 'maxfaults' or more children; * those are the txgs we don't have enough replication to read. For example, * double-parity RAID-Z can tolerate up to two missing devices (maxfaults == 2); * thus, its DTL_MISSING consists of the set of txgs that appear in more than * two child DTL_MISSING maps. * * It should be clear from the above that to compute the DTLs and outage maps * for all vdevs, it suffices to know just the leaf vdevs' DTL_MISSING maps. * Therefore, that is all we keep on disk. When loading the pool, or after * a configuration change, we generate all other DTLs from first principles. */ void vdev_dtl_dirty(vdev_t *vd, vdev_dtl_type_t t, uint64_t txg, uint64_t size) { range_tree_t *rt = vd->vdev_dtl[t]; ASSERT(t < DTL_TYPES); ASSERT(vd != vd->vdev_spa->spa_root_vdev); ASSERT(spa_writeable(vd->vdev_spa)); mutex_enter(&vd->vdev_dtl_lock); if (!range_tree_contains(rt, txg, size)) range_tree_add(rt, txg, size); mutex_exit(&vd->vdev_dtl_lock); } boolean_t vdev_dtl_contains(vdev_t *vd, vdev_dtl_type_t t, uint64_t txg, uint64_t size) { range_tree_t *rt = vd->vdev_dtl[t]; boolean_t dirty = B_FALSE; ASSERT(t < DTL_TYPES); ASSERT(vd != vd->vdev_spa->spa_root_vdev); /* * While we are loading the pool, the DTLs have not been loaded yet. * This isn't a problem but it can result in devices being tried * which are known to not have the data. In which case, the import * is relying on the checksum to ensure that we get the right data. * Note that while importing we are only reading the MOS, which is * always checksummed. */ mutex_enter(&vd->vdev_dtl_lock); if (!range_tree_is_empty(rt)) dirty = range_tree_contains(rt, txg, size); mutex_exit(&vd->vdev_dtl_lock); return (dirty); } boolean_t vdev_dtl_empty(vdev_t *vd, vdev_dtl_type_t t) { range_tree_t *rt = vd->vdev_dtl[t]; boolean_t empty; mutex_enter(&vd->vdev_dtl_lock); empty = range_tree_is_empty(rt); mutex_exit(&vd->vdev_dtl_lock); return (empty); } /* * Check if the txg falls within the range which must be * resilvered. DVAs outside this range can always be skipped. */ boolean_t vdev_default_need_resilver(vdev_t *vd, const dva_t *dva, size_t psize, uint64_t phys_birth) { (void) dva, (void) psize; /* Set by sequential resilver. */ if (phys_birth == TXG_UNKNOWN) return (B_TRUE); return (vdev_dtl_contains(vd, DTL_PARTIAL, phys_birth, 1)); } /* * Returns B_TRUE if the vdev determines the DVA needs to be resilvered. */ boolean_t vdev_dtl_need_resilver(vdev_t *vd, const dva_t *dva, size_t psize, uint64_t phys_birth) { ASSERT(vd != vd->vdev_spa->spa_root_vdev); if (vd->vdev_ops->vdev_op_need_resilver == NULL || vd->vdev_ops->vdev_op_leaf) return (B_TRUE); return (vd->vdev_ops->vdev_op_need_resilver(vd, dva, psize, phys_birth)); } /* * Returns the lowest txg in the DTL range. */ static uint64_t vdev_dtl_min(vdev_t *vd) { ASSERT(MUTEX_HELD(&vd->vdev_dtl_lock)); ASSERT3U(range_tree_space(vd->vdev_dtl[DTL_MISSING]), !=, 0); ASSERT0(vd->vdev_children); return (range_tree_min(vd->vdev_dtl[DTL_MISSING]) - 1); } /* * Returns the highest txg in the DTL. */ static uint64_t vdev_dtl_max(vdev_t *vd) { ASSERT(MUTEX_HELD(&vd->vdev_dtl_lock)); ASSERT3U(range_tree_space(vd->vdev_dtl[DTL_MISSING]), !=, 0); ASSERT0(vd->vdev_children); return (range_tree_max(vd->vdev_dtl[DTL_MISSING])); } /* * Determine if a resilvering vdev should remove any DTL entries from * its range. If the vdev was resilvering for the entire duration of the * scan then it should excise that range from its DTLs. Otherwise, this * vdev is considered partially resilvered and should leave its DTL * entries intact. The comment in vdev_dtl_reassess() describes how we * excise the DTLs. */ static boolean_t vdev_dtl_should_excise(vdev_t *vd, boolean_t rebuild_done) { ASSERT0(vd->vdev_children); if (vd->vdev_state < VDEV_STATE_DEGRADED) return (B_FALSE); if (vd->vdev_resilver_deferred) return (B_FALSE); if (range_tree_is_empty(vd->vdev_dtl[DTL_MISSING])) return (B_TRUE); if (rebuild_done) { vdev_rebuild_t *vr = &vd->vdev_top->vdev_rebuild_config; vdev_rebuild_phys_t *vrp = &vr->vr_rebuild_phys; /* Rebuild not initiated by attach */ if (vd->vdev_rebuild_txg == 0) return (B_TRUE); /* * When a rebuild completes without error then all missing data * up to the rebuild max txg has been reconstructed and the DTL * is eligible for excision. */ if (vrp->vrp_rebuild_state == VDEV_REBUILD_COMPLETE && vdev_dtl_max(vd) <= vrp->vrp_max_txg) { ASSERT3U(vrp->vrp_min_txg, <=, vdev_dtl_min(vd)); ASSERT3U(vrp->vrp_min_txg, <, vd->vdev_rebuild_txg); ASSERT3U(vd->vdev_rebuild_txg, <=, vrp->vrp_max_txg); return (B_TRUE); } } else { dsl_scan_t *scn = vd->vdev_spa->spa_dsl_pool->dp_scan; dsl_scan_phys_t *scnp __maybe_unused = &scn->scn_phys; /* Resilver not initiated by attach */ if (vd->vdev_resilver_txg == 0) return (B_TRUE); /* * When a resilver is initiated the scan will assign the * scn_max_txg value to the highest txg value that exists * in all DTLs. If this device's max DTL is not part of this * scan (i.e. it is not in the range (scn_min_txg, scn_max_txg] * then it is not eligible for excision. */ if (vdev_dtl_max(vd) <= scn->scn_phys.scn_max_txg) { ASSERT3U(scnp->scn_min_txg, <=, vdev_dtl_min(vd)); ASSERT3U(scnp->scn_min_txg, <, vd->vdev_resilver_txg); ASSERT3U(vd->vdev_resilver_txg, <=, scnp->scn_max_txg); return (B_TRUE); } } return (B_FALSE); } /* * Reassess DTLs after a config change or scrub completion. If txg == 0 no * write operations will be issued to the pool. */ void vdev_dtl_reassess(vdev_t *vd, uint64_t txg, uint64_t scrub_txg, boolean_t scrub_done, boolean_t rebuild_done) { spa_t *spa = vd->vdev_spa; avl_tree_t reftree; int minref; ASSERT(spa_config_held(spa, SCL_ALL, RW_READER) != 0); for (int c = 0; c < vd->vdev_children; c++) vdev_dtl_reassess(vd->vdev_child[c], txg, scrub_txg, scrub_done, rebuild_done); if (vd == spa->spa_root_vdev || !vdev_is_concrete(vd) || vd->vdev_aux) return; if (vd->vdev_ops->vdev_op_leaf) { dsl_scan_t *scn = spa->spa_dsl_pool->dp_scan; vdev_rebuild_t *vr = &vd->vdev_top->vdev_rebuild_config; boolean_t check_excise = B_FALSE; boolean_t wasempty = B_TRUE; mutex_enter(&vd->vdev_dtl_lock); /* * If requested, pretend the scan or rebuild completed cleanly. */ if (zfs_scan_ignore_errors) { if (scn != NULL) scn->scn_phys.scn_errors = 0; if (vr != NULL) vr->vr_rebuild_phys.vrp_errors = 0; } if (scrub_txg != 0 && !range_tree_is_empty(vd->vdev_dtl[DTL_MISSING])) { wasempty = B_FALSE; zfs_dbgmsg("guid:%llu txg:%llu scrub:%llu started:%d " "dtl:%llu/%llu errors:%llu", (u_longlong_t)vd->vdev_guid, (u_longlong_t)txg, (u_longlong_t)scrub_txg, spa->spa_scrub_started, (u_longlong_t)vdev_dtl_min(vd), (u_longlong_t)vdev_dtl_max(vd), (u_longlong_t)(scn ? scn->scn_phys.scn_errors : 0)); } /* * If we've completed a scrub/resilver or a rebuild cleanly * then determine if this vdev should remove any DTLs. We * only want to excise regions on vdevs that were available * during the entire duration of this scan. */ if (rebuild_done && vr != NULL && vr->vr_rebuild_phys.vrp_errors == 0) { check_excise = B_TRUE; } else { if (spa->spa_scrub_started || (scn != NULL && scn->scn_phys.scn_errors == 0)) { check_excise = B_TRUE; } } if (scrub_txg && check_excise && vdev_dtl_should_excise(vd, rebuild_done)) { /* * We completed a scrub, resilver or rebuild up to * scrub_txg. If we did it without rebooting, then * the scrub dtl will be valid, so excise the old * region and fold in the scrub dtl. Otherwise, * leave the dtl as-is if there was an error. * * There's little trick here: to excise the beginning * of the DTL_MISSING map, we put it into a reference * tree and then add a segment with refcnt -1 that * covers the range [0, scrub_txg). This means * that each txg in that range has refcnt -1 or 0. * We then add DTL_SCRUB with a refcnt of 2, so that * entries in the range [0, scrub_txg) will have a * positive refcnt -- either 1 or 2. We then convert * the reference tree into the new DTL_MISSING map. */ space_reftree_create(&reftree); space_reftree_add_map(&reftree, vd->vdev_dtl[DTL_MISSING], 1); space_reftree_add_seg(&reftree, 0, scrub_txg, -1); space_reftree_add_map(&reftree, vd->vdev_dtl[DTL_SCRUB], 2); space_reftree_generate_map(&reftree, vd->vdev_dtl[DTL_MISSING], 1); space_reftree_destroy(&reftree); if (!range_tree_is_empty(vd->vdev_dtl[DTL_MISSING])) { zfs_dbgmsg("update DTL_MISSING:%llu/%llu", (u_longlong_t)vdev_dtl_min(vd), (u_longlong_t)vdev_dtl_max(vd)); } else if (!wasempty) { zfs_dbgmsg("DTL_MISSING is now empty"); } } range_tree_vacate(vd->vdev_dtl[DTL_PARTIAL], NULL, NULL); range_tree_walk(vd->vdev_dtl[DTL_MISSING], range_tree_add, vd->vdev_dtl[DTL_PARTIAL]); if (scrub_done) range_tree_vacate(vd->vdev_dtl[DTL_SCRUB], NULL, NULL); range_tree_vacate(vd->vdev_dtl[DTL_OUTAGE], NULL, NULL); if (!vdev_readable(vd)) range_tree_add(vd->vdev_dtl[DTL_OUTAGE], 0, -1ULL); else range_tree_walk(vd->vdev_dtl[DTL_MISSING], range_tree_add, vd->vdev_dtl[DTL_OUTAGE]); /* * If the vdev was resilvering or rebuilding and no longer * has any DTLs then reset the appropriate flag and dirty * the top level so that we persist the change. */ if (txg != 0 && range_tree_is_empty(vd->vdev_dtl[DTL_MISSING]) && range_tree_is_empty(vd->vdev_dtl[DTL_OUTAGE])) { if (vd->vdev_rebuild_txg != 0) { vd->vdev_rebuild_txg = 0; vdev_config_dirty(vd->vdev_top); } else if (vd->vdev_resilver_txg != 0) { vd->vdev_resilver_txg = 0; vdev_config_dirty(vd->vdev_top); } } mutex_exit(&vd->vdev_dtl_lock); if (txg != 0) vdev_dirty(vd->vdev_top, VDD_DTL, vd, txg); return; } mutex_enter(&vd->vdev_dtl_lock); for (int t = 0; t < DTL_TYPES; t++) { /* account for child's outage in parent's missing map */ int s = (t == DTL_MISSING) ? DTL_OUTAGE: t; if (t == DTL_SCRUB) continue; /* leaf vdevs only */ if (t == DTL_PARTIAL) minref = 1; /* i.e. non-zero */ else if (vdev_get_nparity(vd) != 0) minref = vdev_get_nparity(vd) + 1; /* RAID-Z, dRAID */ else minref = vd->vdev_children; /* any kind of mirror */ space_reftree_create(&reftree); for (int c = 0; c < vd->vdev_children; c++) { vdev_t *cvd = vd->vdev_child[c]; mutex_enter(&cvd->vdev_dtl_lock); space_reftree_add_map(&reftree, cvd->vdev_dtl[s], 1); mutex_exit(&cvd->vdev_dtl_lock); } space_reftree_generate_map(&reftree, vd->vdev_dtl[t], minref); space_reftree_destroy(&reftree); } mutex_exit(&vd->vdev_dtl_lock); } /* * Iterate over all the vdevs except spare, and post kobj events */ void vdev_post_kobj_evt(vdev_t *vd) { if (vd->vdev_ops->vdev_op_kobj_evt_post && vd->vdev_kobj_flag == B_FALSE) { vd->vdev_kobj_flag = B_TRUE; vd->vdev_ops->vdev_op_kobj_evt_post(vd); } for (int c = 0; c < vd->vdev_children; c++) vdev_post_kobj_evt(vd->vdev_child[c]); } /* * Iterate over all the vdevs except spare, and clear kobj events */ void vdev_clear_kobj_evt(vdev_t *vd) { vd->vdev_kobj_flag = B_FALSE; for (int c = 0; c < vd->vdev_children; c++) vdev_clear_kobj_evt(vd->vdev_child[c]); } int vdev_dtl_load(vdev_t *vd) { spa_t *spa = vd->vdev_spa; objset_t *mos = spa->spa_meta_objset; range_tree_t *rt; int error = 0; if (vd->vdev_ops->vdev_op_leaf && vd->vdev_dtl_object != 0) { ASSERT(vdev_is_concrete(vd)); /* * If the dtl cannot be sync'd there is no need to open it. */ if (spa->spa_mode == SPA_MODE_READ && !spa->spa_read_spacemaps) return (0); error = space_map_open(&vd->vdev_dtl_sm, mos, vd->vdev_dtl_object, 0, -1ULL, 0); if (error) return (error); ASSERT(vd->vdev_dtl_sm != NULL); rt = range_tree_create(NULL, RANGE_SEG64, NULL, 0, 0); error = space_map_load(vd->vdev_dtl_sm, rt, SM_ALLOC); if (error == 0) { mutex_enter(&vd->vdev_dtl_lock); range_tree_walk(rt, range_tree_add, vd->vdev_dtl[DTL_MISSING]); mutex_exit(&vd->vdev_dtl_lock); } range_tree_vacate(rt, NULL, NULL); range_tree_destroy(rt); return (error); } for (int c = 0; c < vd->vdev_children; c++) { error = vdev_dtl_load(vd->vdev_child[c]); if (error != 0) break; } return (error); } static void vdev_zap_allocation_data(vdev_t *vd, dmu_tx_t *tx) { spa_t *spa = vd->vdev_spa; objset_t *mos = spa->spa_meta_objset; vdev_alloc_bias_t alloc_bias = vd->vdev_alloc_bias; const char *string; ASSERT(alloc_bias != VDEV_BIAS_NONE); string = (alloc_bias == VDEV_BIAS_LOG) ? VDEV_ALLOC_BIAS_LOG : (alloc_bias == VDEV_BIAS_SPECIAL) ? VDEV_ALLOC_BIAS_SPECIAL : (alloc_bias == VDEV_BIAS_DEDUP) ? VDEV_ALLOC_BIAS_DEDUP : NULL; ASSERT(string != NULL); VERIFY0(zap_add(mos, vd->vdev_top_zap, VDEV_TOP_ZAP_ALLOCATION_BIAS, 1, strlen(string) + 1, string, tx)); if (alloc_bias == VDEV_BIAS_SPECIAL || alloc_bias == VDEV_BIAS_DEDUP) { spa_activate_allocation_classes(spa, tx); } } void vdev_destroy_unlink_zap(vdev_t *vd, uint64_t zapobj, dmu_tx_t *tx) { spa_t *spa = vd->vdev_spa; VERIFY0(zap_destroy(spa->spa_meta_objset, zapobj, tx)); VERIFY0(zap_remove_int(spa->spa_meta_objset, spa->spa_all_vdev_zaps, zapobj, tx)); } uint64_t vdev_create_link_zap(vdev_t *vd, dmu_tx_t *tx) { spa_t *spa = vd->vdev_spa; uint64_t zap = zap_create(spa->spa_meta_objset, DMU_OTN_ZAP_METADATA, DMU_OT_NONE, 0, tx); ASSERT(zap != 0); VERIFY0(zap_add_int(spa->spa_meta_objset, spa->spa_all_vdev_zaps, zap, tx)); return (zap); } void vdev_construct_zaps(vdev_t *vd, dmu_tx_t *tx) { if (vd->vdev_ops != &vdev_hole_ops && vd->vdev_ops != &vdev_missing_ops && vd->vdev_ops != &vdev_root_ops && !vd->vdev_top->vdev_removing) { if (vd->vdev_ops->vdev_op_leaf && vd->vdev_leaf_zap == 0) { vd->vdev_leaf_zap = vdev_create_link_zap(vd, tx); } if (vd == vd->vdev_top && vd->vdev_top_zap == 0) { vd->vdev_top_zap = vdev_create_link_zap(vd, tx); if (vd->vdev_alloc_bias != VDEV_BIAS_NONE) vdev_zap_allocation_data(vd, tx); } } for (uint64_t i = 0; i < vd->vdev_children; i++) { vdev_construct_zaps(vd->vdev_child[i], tx); } } static void vdev_dtl_sync(vdev_t *vd, uint64_t txg) { spa_t *spa = vd->vdev_spa; range_tree_t *rt = vd->vdev_dtl[DTL_MISSING]; objset_t *mos = spa->spa_meta_objset; range_tree_t *rtsync; dmu_tx_t *tx; uint64_t object = space_map_object(vd->vdev_dtl_sm); ASSERT(vdev_is_concrete(vd)); ASSERT(vd->vdev_ops->vdev_op_leaf); tx = dmu_tx_create_assigned(spa->spa_dsl_pool, txg); if (vd->vdev_detached || vd->vdev_top->vdev_removing) { mutex_enter(&vd->vdev_dtl_lock); space_map_free(vd->vdev_dtl_sm, tx); space_map_close(vd->vdev_dtl_sm); vd->vdev_dtl_sm = NULL; mutex_exit(&vd->vdev_dtl_lock); /* * We only destroy the leaf ZAP for detached leaves or for * removed log devices. Removed data devices handle leaf ZAP * cleanup later, once cancellation is no longer possible. */ if (vd->vdev_leaf_zap != 0 && (vd->vdev_detached || vd->vdev_top->vdev_islog)) { vdev_destroy_unlink_zap(vd, vd->vdev_leaf_zap, tx); vd->vdev_leaf_zap = 0; } dmu_tx_commit(tx); return; } if (vd->vdev_dtl_sm == NULL) { uint64_t new_object; new_object = space_map_alloc(mos, zfs_vdev_dtl_sm_blksz, tx); VERIFY3U(new_object, !=, 0); VERIFY0(space_map_open(&vd->vdev_dtl_sm, mos, new_object, 0, -1ULL, 0)); ASSERT(vd->vdev_dtl_sm != NULL); } rtsync = range_tree_create(NULL, RANGE_SEG64, NULL, 0, 0); mutex_enter(&vd->vdev_dtl_lock); range_tree_walk(rt, range_tree_add, rtsync); mutex_exit(&vd->vdev_dtl_lock); space_map_truncate(vd->vdev_dtl_sm, zfs_vdev_dtl_sm_blksz, tx); space_map_write(vd->vdev_dtl_sm, rtsync, SM_ALLOC, SM_NO_VDEVID, tx); range_tree_vacate(rtsync, NULL, NULL); range_tree_destroy(rtsync); /* * If the object for the space map has changed then dirty * the top level so that we update the config. */ if (object != space_map_object(vd->vdev_dtl_sm)) { vdev_dbgmsg(vd, "txg %llu, spa %s, DTL old object %llu, " "new object %llu", (u_longlong_t)txg, spa_name(spa), (u_longlong_t)object, (u_longlong_t)space_map_object(vd->vdev_dtl_sm)); vdev_config_dirty(vd->vdev_top); } dmu_tx_commit(tx); } /* * Determine whether the specified vdev can be offlined/detached/removed * without losing data. */ boolean_t vdev_dtl_required(vdev_t *vd) { spa_t *spa = vd->vdev_spa; vdev_t *tvd = vd->vdev_top; uint8_t cant_read = vd->vdev_cant_read; boolean_t required; ASSERT(spa_config_held(spa, SCL_STATE_ALL, RW_WRITER) == SCL_STATE_ALL); if (vd == spa->spa_root_vdev || vd == tvd) return (B_TRUE); /* * Temporarily mark the device as unreadable, and then determine * whether this results in any DTL outages in the top-level vdev. * If not, we can safely offline/detach/remove the device. */ vd->vdev_cant_read = B_TRUE; vdev_dtl_reassess(tvd, 0, 0, B_FALSE, B_FALSE); required = !vdev_dtl_empty(tvd, DTL_OUTAGE); vd->vdev_cant_read = cant_read; vdev_dtl_reassess(tvd, 0, 0, B_FALSE, B_FALSE); if (!required && zio_injection_enabled) { required = !!zio_handle_device_injection(vd, NULL, SET_ERROR(ECHILD)); } return (required); } /* * Determine if resilver is needed, and if so the txg range. */ boolean_t vdev_resilver_needed(vdev_t *vd, uint64_t *minp, uint64_t *maxp) { boolean_t needed = B_FALSE; uint64_t thismin = UINT64_MAX; uint64_t thismax = 0; if (vd->vdev_children == 0) { mutex_enter(&vd->vdev_dtl_lock); if (!range_tree_is_empty(vd->vdev_dtl[DTL_MISSING]) && vdev_writeable(vd)) { thismin = vdev_dtl_min(vd); thismax = vdev_dtl_max(vd); needed = B_TRUE; } mutex_exit(&vd->vdev_dtl_lock); } else { for (int c = 0; c < vd->vdev_children; c++) { vdev_t *cvd = vd->vdev_child[c]; uint64_t cmin, cmax; if (vdev_resilver_needed(cvd, &cmin, &cmax)) { thismin = MIN(thismin, cmin); thismax = MAX(thismax, cmax); needed = B_TRUE; } } } if (needed && minp) { *minp = thismin; *maxp = thismax; } return (needed); } /* * Gets the checkpoint space map object from the vdev's ZAP. On success sm_obj * will contain either the checkpoint spacemap object or zero if none exists. * All other errors are returned to the caller. */ int vdev_checkpoint_sm_object(vdev_t *vd, uint64_t *sm_obj) { ASSERT0(spa_config_held(vd->vdev_spa, SCL_ALL, RW_WRITER)); if (vd->vdev_top_zap == 0) { *sm_obj = 0; return (0); } int error = zap_lookup(spa_meta_objset(vd->vdev_spa), vd->vdev_top_zap, VDEV_TOP_ZAP_POOL_CHECKPOINT_SM, sizeof (uint64_t), 1, sm_obj); if (error == ENOENT) { *sm_obj = 0; error = 0; } return (error); } int vdev_load(vdev_t *vd) { int children = vd->vdev_children; int error = 0; taskq_t *tq = NULL; /* * It's only worthwhile to use the taskq for the root vdev, because the * slow part is metaslab_init, and that only happens for top-level * vdevs. */ if (vd->vdev_ops == &vdev_root_ops && vd->vdev_children > 0) { tq = taskq_create("vdev_load", children, minclsyspri, children, children, TASKQ_PREPOPULATE); } /* * Recursively load all children. */ for (int c = 0; c < vd->vdev_children; c++) { vdev_t *cvd = vd->vdev_child[c]; if (tq == NULL || vdev_uses_zvols(cvd)) { cvd->vdev_load_error = vdev_load(cvd); } else { VERIFY(taskq_dispatch(tq, vdev_load_child, cvd, TQ_SLEEP) != TASKQID_INVALID); } } if (tq != NULL) { taskq_wait(tq); taskq_destroy(tq); } for (int c = 0; c < vd->vdev_children; c++) { int error = vd->vdev_child[c]->vdev_load_error; if (error != 0) return (error); } vdev_set_deflate_ratio(vd); /* * On spa_load path, grab the allocation bias from our zap */ if (vd == vd->vdev_top && vd->vdev_top_zap != 0) { spa_t *spa = vd->vdev_spa; char bias_str[64]; error = zap_lookup(spa->spa_meta_objset, vd->vdev_top_zap, VDEV_TOP_ZAP_ALLOCATION_BIAS, 1, sizeof (bias_str), bias_str); if (error == 0) { ASSERT(vd->vdev_alloc_bias == VDEV_BIAS_NONE); vd->vdev_alloc_bias = vdev_derive_alloc_bias(bias_str); } else if (error != ENOENT) { vdev_set_state(vd, B_FALSE, VDEV_STATE_CANT_OPEN, VDEV_AUX_CORRUPT_DATA); vdev_dbgmsg(vd, "vdev_load: zap_lookup(top_zap=%llu) " "failed [error=%d]", (u_longlong_t)vd->vdev_top_zap, error); return (error); } } /* * Load any rebuild state from the top-level vdev zap. */ if (vd == vd->vdev_top && vd->vdev_top_zap != 0) { error = vdev_rebuild_load(vd); if (error && error != ENOTSUP) { vdev_set_state(vd, B_FALSE, VDEV_STATE_CANT_OPEN, VDEV_AUX_CORRUPT_DATA); vdev_dbgmsg(vd, "vdev_load: vdev_rebuild_load " "failed [error=%d]", error); return (error); } } /* * If this is a top-level vdev, initialize its metaslabs. */ if (vd == vd->vdev_top && vdev_is_concrete(vd)) { vdev_metaslab_group_create(vd); if (vd->vdev_ashift == 0 || vd->vdev_asize == 0) { vdev_set_state(vd, B_FALSE, VDEV_STATE_CANT_OPEN, VDEV_AUX_CORRUPT_DATA); vdev_dbgmsg(vd, "vdev_load: invalid size. ashift=%llu, " "asize=%llu", (u_longlong_t)vd->vdev_ashift, (u_longlong_t)vd->vdev_asize); return (SET_ERROR(ENXIO)); } error = vdev_metaslab_init(vd, 0); if (error != 0) { vdev_dbgmsg(vd, "vdev_load: metaslab_init failed " "[error=%d]", error); vdev_set_state(vd, B_FALSE, VDEV_STATE_CANT_OPEN, VDEV_AUX_CORRUPT_DATA); return (error); } uint64_t checkpoint_sm_obj; error = vdev_checkpoint_sm_object(vd, &checkpoint_sm_obj); if (error == 0 && checkpoint_sm_obj != 0) { objset_t *mos = spa_meta_objset(vd->vdev_spa); ASSERT(vd->vdev_asize != 0); ASSERT3P(vd->vdev_checkpoint_sm, ==, NULL); error = space_map_open(&vd->vdev_checkpoint_sm, mos, checkpoint_sm_obj, 0, vd->vdev_asize, vd->vdev_ashift); if (error != 0) { vdev_dbgmsg(vd, "vdev_load: space_map_open " "failed for checkpoint spacemap (obj %llu) " "[error=%d]", (u_longlong_t)checkpoint_sm_obj, error); return (error); } ASSERT3P(vd->vdev_checkpoint_sm, !=, NULL); /* * Since the checkpoint_sm contains free entries * exclusively we can use space_map_allocated() to * indicate the cumulative checkpointed space that * has been freed. */ vd->vdev_stat.vs_checkpoint_space = -space_map_allocated(vd->vdev_checkpoint_sm); vd->vdev_spa->spa_checkpoint_info.sci_dspace += vd->vdev_stat.vs_checkpoint_space; } else if (error != 0) { vdev_dbgmsg(vd, "vdev_load: failed to retrieve " "checkpoint space map object from vdev ZAP " "[error=%d]", error); return (error); } } /* * If this is a leaf vdev, load its DTL. */ if (vd->vdev_ops->vdev_op_leaf && (error = vdev_dtl_load(vd)) != 0) { vdev_set_state(vd, B_FALSE, VDEV_STATE_CANT_OPEN, VDEV_AUX_CORRUPT_DATA); vdev_dbgmsg(vd, "vdev_load: vdev_dtl_load failed " "[error=%d]", error); return (error); } uint64_t obsolete_sm_object; error = vdev_obsolete_sm_object(vd, &obsolete_sm_object); if (error == 0 && obsolete_sm_object != 0) { objset_t *mos = vd->vdev_spa->spa_meta_objset; ASSERT(vd->vdev_asize != 0); ASSERT3P(vd->vdev_obsolete_sm, ==, NULL); if ((error = space_map_open(&vd->vdev_obsolete_sm, mos, obsolete_sm_object, 0, vd->vdev_asize, 0))) { vdev_set_state(vd, B_FALSE, VDEV_STATE_CANT_OPEN, VDEV_AUX_CORRUPT_DATA); vdev_dbgmsg(vd, "vdev_load: space_map_open failed for " "obsolete spacemap (obj %llu) [error=%d]", (u_longlong_t)obsolete_sm_object, error); return (error); } } else if (error != 0) { vdev_dbgmsg(vd, "vdev_load: failed to retrieve obsolete " "space map object from vdev ZAP [error=%d]", error); return (error); } return (0); } /* * The special vdev case is used for hot spares and l2cache devices. Its * sole purpose it to set the vdev state for the associated vdev. To do this, * we make sure that we can open the underlying device, then try to read the * label, and make sure that the label is sane and that it hasn't been * repurposed to another pool. */ int vdev_validate_aux(vdev_t *vd) { nvlist_t *label; uint64_t guid, version; uint64_t state; if (!vdev_readable(vd)) return (0); if ((label = vdev_label_read_config(vd, -1ULL)) == NULL) { vdev_set_state(vd, B_TRUE, VDEV_STATE_CANT_OPEN, VDEV_AUX_CORRUPT_DATA); return (-1); } if (nvlist_lookup_uint64(label, ZPOOL_CONFIG_VERSION, &version) != 0 || !SPA_VERSION_IS_SUPPORTED(version) || nvlist_lookup_uint64(label, ZPOOL_CONFIG_GUID, &guid) != 0 || guid != vd->vdev_guid || nvlist_lookup_uint64(label, ZPOOL_CONFIG_POOL_STATE, &state) != 0) { vdev_set_state(vd, B_TRUE, VDEV_STATE_CANT_OPEN, VDEV_AUX_CORRUPT_DATA); nvlist_free(label); return (-1); } /* * We don't actually check the pool state here. If it's in fact in * use by another pool, we update this fact on the fly when requested. */ nvlist_free(label); return (0); } static void vdev_destroy_ms_flush_data(vdev_t *vd, dmu_tx_t *tx) { objset_t *mos = spa_meta_objset(vd->vdev_spa); if (vd->vdev_top_zap == 0) return; uint64_t object = 0; int err = zap_lookup(mos, vd->vdev_top_zap, VDEV_TOP_ZAP_MS_UNFLUSHED_PHYS_TXGS, sizeof (uint64_t), 1, &object); if (err == ENOENT) return; VERIFY0(err); VERIFY0(dmu_object_free(mos, object, tx)); VERIFY0(zap_remove(mos, vd->vdev_top_zap, VDEV_TOP_ZAP_MS_UNFLUSHED_PHYS_TXGS, tx)); } /* * Free the objects used to store this vdev's spacemaps, and the array * that points to them. */ void vdev_destroy_spacemaps(vdev_t *vd, dmu_tx_t *tx) { if (vd->vdev_ms_array == 0) return; objset_t *mos = vd->vdev_spa->spa_meta_objset; uint64_t array_count = vd->vdev_asize >> vd->vdev_ms_shift; size_t array_bytes = array_count * sizeof (uint64_t); uint64_t *smobj_array = kmem_alloc(array_bytes, KM_SLEEP); VERIFY0(dmu_read(mos, vd->vdev_ms_array, 0, array_bytes, smobj_array, 0)); for (uint64_t i = 0; i < array_count; i++) { uint64_t smobj = smobj_array[i]; if (smobj == 0) continue; space_map_free_obj(mos, smobj, tx); } kmem_free(smobj_array, array_bytes); VERIFY0(dmu_object_free(mos, vd->vdev_ms_array, tx)); vdev_destroy_ms_flush_data(vd, tx); vd->vdev_ms_array = 0; } static void vdev_remove_empty_log(vdev_t *vd, uint64_t txg) { spa_t *spa = vd->vdev_spa; ASSERT(vd->vdev_islog); ASSERT(vd == vd->vdev_top); ASSERT3U(txg, ==, spa_syncing_txg(spa)); dmu_tx_t *tx = dmu_tx_create_assigned(spa_get_dsl(spa), txg); vdev_destroy_spacemaps(vd, tx); if (vd->vdev_top_zap != 0) { vdev_destroy_unlink_zap(vd, vd->vdev_top_zap, tx); vd->vdev_top_zap = 0; } dmu_tx_commit(tx); } void vdev_sync_done(vdev_t *vd, uint64_t txg) { metaslab_t *msp; boolean_t reassess = !txg_list_empty(&vd->vdev_ms_list, TXG_CLEAN(txg)); ASSERT(vdev_is_concrete(vd)); while ((msp = txg_list_remove(&vd->vdev_ms_list, TXG_CLEAN(txg))) != NULL) metaslab_sync_done(msp, txg); if (reassess) { metaslab_sync_reassess(vd->vdev_mg); if (vd->vdev_log_mg != NULL) metaslab_sync_reassess(vd->vdev_log_mg); } } void vdev_sync(vdev_t *vd, uint64_t txg) { spa_t *spa = vd->vdev_spa; vdev_t *lvd; metaslab_t *msp; ASSERT3U(txg, ==, spa->spa_syncing_txg); dmu_tx_t *tx = dmu_tx_create_assigned(spa->spa_dsl_pool, txg); if (range_tree_space(vd->vdev_obsolete_segments) > 0) { ASSERT(vd->vdev_removing || vd->vdev_ops == &vdev_indirect_ops); vdev_indirect_sync_obsolete(vd, tx); /* * If the vdev is indirect, it can't have dirty * metaslabs or DTLs. */ if (vd->vdev_ops == &vdev_indirect_ops) { ASSERT(txg_list_empty(&vd->vdev_ms_list, txg)); ASSERT(txg_list_empty(&vd->vdev_dtl_list, txg)); dmu_tx_commit(tx); return; } } ASSERT(vdev_is_concrete(vd)); if (vd->vdev_ms_array == 0 && vd->vdev_ms_shift != 0 && !vd->vdev_removing) { ASSERT(vd == vd->vdev_top); ASSERT0(vd->vdev_indirect_config.vic_mapping_object); vd->vdev_ms_array = dmu_object_alloc(spa->spa_meta_objset, DMU_OT_OBJECT_ARRAY, 0, DMU_OT_NONE, 0, tx); ASSERT(vd->vdev_ms_array != 0); vdev_config_dirty(vd); } while ((msp = txg_list_remove(&vd->vdev_ms_list, txg)) != NULL) { metaslab_sync(msp, txg); (void) txg_list_add(&vd->vdev_ms_list, msp, TXG_CLEAN(txg)); } while ((lvd = txg_list_remove(&vd->vdev_dtl_list, txg)) != NULL) vdev_dtl_sync(lvd, txg); /* * If this is an empty log device being removed, destroy the * metadata associated with it. */ if (vd->vdev_islog && vd->vdev_stat.vs_alloc == 0 && vd->vdev_removing) vdev_remove_empty_log(vd, txg); (void) txg_list_add(&spa->spa_vdev_txg_list, vd, TXG_CLEAN(txg)); dmu_tx_commit(tx); } uint64_t vdev_psize_to_asize(vdev_t *vd, uint64_t psize) { return (vd->vdev_ops->vdev_op_asize(vd, psize)); } /* * Mark the given vdev faulted. A faulted vdev behaves as if the device could * not be opened, and no I/O is attempted. */ int vdev_fault(spa_t *spa, uint64_t guid, vdev_aux_t aux) { vdev_t *vd, *tvd; spa_vdev_state_enter(spa, SCL_NONE); if ((vd = spa_lookup_by_guid(spa, guid, B_TRUE)) == NULL) return (spa_vdev_state_exit(spa, NULL, SET_ERROR(ENODEV))); if (!vd->vdev_ops->vdev_op_leaf) return (spa_vdev_state_exit(spa, NULL, SET_ERROR(ENOTSUP))); tvd = vd->vdev_top; /* * If user did a 'zpool offline -f' then make the fault persist across * reboots. */ if (aux == VDEV_AUX_EXTERNAL_PERSIST) { /* * There are two kinds of forced faults: temporary and * persistent. Temporary faults go away at pool import, while * persistent faults stay set. Both types of faults can be * cleared with a zpool clear. * * We tell if a vdev is persistently faulted by looking at the * ZPOOL_CONFIG_AUX_STATE nvpair. If it's set to "external" at * import then it's a persistent fault. Otherwise, it's * temporary. We get ZPOOL_CONFIG_AUX_STATE set to "external" * by setting vd.vdev_stat.vs_aux to VDEV_AUX_EXTERNAL. This * tells vdev_config_generate() (which gets run later) to set * ZPOOL_CONFIG_AUX_STATE to "external" in the nvlist. */ vd->vdev_stat.vs_aux = VDEV_AUX_EXTERNAL; vd->vdev_tmpoffline = B_FALSE; aux = VDEV_AUX_EXTERNAL; } else { vd->vdev_tmpoffline = B_TRUE; } /* * We don't directly use the aux state here, but if we do a * vdev_reopen(), we need this value to be present to remember why we * were faulted. */ vd->vdev_label_aux = aux; /* * Faulted state takes precedence over degraded. */ vd->vdev_delayed_close = B_FALSE; vd->vdev_faulted = 1ULL; vd->vdev_degraded = 0ULL; vdev_set_state(vd, B_FALSE, VDEV_STATE_FAULTED, aux); /* * If this device has the only valid copy of the data, then * back off and simply mark the vdev as degraded instead. */ if (!tvd->vdev_islog && vd->vdev_aux == NULL && vdev_dtl_required(vd)) { vd->vdev_degraded = 1ULL; vd->vdev_faulted = 0ULL; /* * If we reopen the device and it's not dead, only then do we * mark it degraded. */ vdev_reopen(tvd); if (vdev_readable(vd)) vdev_set_state(vd, B_FALSE, VDEV_STATE_DEGRADED, aux); } return (spa_vdev_state_exit(spa, vd, 0)); } /* * Mark the given vdev degraded. A degraded vdev is purely an indication to the * user that something is wrong. The vdev continues to operate as normal as far * as I/O is concerned. */ int vdev_degrade(spa_t *spa, uint64_t guid, vdev_aux_t aux) { vdev_t *vd; spa_vdev_state_enter(spa, SCL_NONE); if ((vd = spa_lookup_by_guid(spa, guid, B_TRUE)) == NULL) return (spa_vdev_state_exit(spa, NULL, SET_ERROR(ENODEV))); if (!vd->vdev_ops->vdev_op_leaf) return (spa_vdev_state_exit(spa, NULL, SET_ERROR(ENOTSUP))); /* * If the vdev is already faulted, then don't do anything. */ if (vd->vdev_faulted || vd->vdev_degraded) return (spa_vdev_state_exit(spa, NULL, 0)); vd->vdev_degraded = 1ULL; if (!vdev_is_dead(vd)) vdev_set_state(vd, B_FALSE, VDEV_STATE_DEGRADED, aux); return (spa_vdev_state_exit(spa, vd, 0)); } int vdev_remove_wanted(spa_t *spa, uint64_t guid) { vdev_t *vd; spa_vdev_state_enter(spa, SCL_NONE); if ((vd = spa_lookup_by_guid(spa, guid, B_TRUE)) == NULL) return (spa_vdev_state_exit(spa, NULL, SET_ERROR(ENODEV))); /* * If the vdev is already removed, then don't do anything. */ if (vd->vdev_removed) return (spa_vdev_state_exit(spa, NULL, 0)); vd->vdev_remove_wanted = B_TRUE; spa_async_request(spa, SPA_ASYNC_REMOVE); return (spa_vdev_state_exit(spa, vd, 0)); } /* * Online the given vdev. * * If 'ZFS_ONLINE_UNSPARE' is set, it implies two things. First, any attached * spare device should be detached when the device finishes resilvering. * Second, the online should be treated like a 'test' online case, so no FMA * events are generated if the device fails to open. */ int vdev_online(spa_t *spa, uint64_t guid, uint64_t flags, vdev_state_t *newstate) { vdev_t *vd, *tvd, *pvd, *rvd = spa->spa_root_vdev; boolean_t wasoffline; vdev_state_t oldstate; spa_vdev_state_enter(spa, SCL_NONE); if ((vd = spa_lookup_by_guid(spa, guid, B_TRUE)) == NULL) return (spa_vdev_state_exit(spa, NULL, SET_ERROR(ENODEV))); if (!vd->vdev_ops->vdev_op_leaf) return (spa_vdev_state_exit(spa, NULL, SET_ERROR(ENOTSUP))); wasoffline = (vd->vdev_offline || vd->vdev_tmpoffline); oldstate = vd->vdev_state; tvd = vd->vdev_top; vd->vdev_offline = B_FALSE; vd->vdev_tmpoffline = B_FALSE; vd->vdev_checkremove = !!(flags & ZFS_ONLINE_CHECKREMOVE); vd->vdev_forcefault = !!(flags & ZFS_ONLINE_FORCEFAULT); /* XXX - L2ARC 1.0 does not support expansion */ if (!vd->vdev_aux) { for (pvd = vd; pvd != rvd; pvd = pvd->vdev_parent) pvd->vdev_expanding = !!((flags & ZFS_ONLINE_EXPAND) || spa->spa_autoexpand); vd->vdev_expansion_time = gethrestime_sec(); } vdev_reopen(tvd); vd->vdev_checkremove = vd->vdev_forcefault = B_FALSE; if (!vd->vdev_aux) { for (pvd = vd; pvd != rvd; pvd = pvd->vdev_parent) pvd->vdev_expanding = B_FALSE; } if (newstate) *newstate = vd->vdev_state; if ((flags & ZFS_ONLINE_UNSPARE) && !vdev_is_dead(vd) && vd->vdev_parent && vd->vdev_parent->vdev_ops == &vdev_spare_ops && vd->vdev_parent->vdev_child[0] == vd) vd->vdev_unspare = B_TRUE; if ((flags & ZFS_ONLINE_EXPAND) || spa->spa_autoexpand) { /* XXX - L2ARC 1.0 does not support expansion */ if (vd->vdev_aux) return (spa_vdev_state_exit(spa, vd, ENOTSUP)); spa_async_request(spa, SPA_ASYNC_CONFIG_UPDATE); } /* Restart initializing if necessary */ mutex_enter(&vd->vdev_initialize_lock); if (vdev_writeable(vd) && vd->vdev_initialize_thread == NULL && vd->vdev_initialize_state == VDEV_INITIALIZE_ACTIVE) { (void) vdev_initialize(vd); } mutex_exit(&vd->vdev_initialize_lock); /* * Restart trimming if necessary. We do not restart trimming for cache * devices here. This is triggered by l2arc_rebuild_vdev() * asynchronously for the whole device or in l2arc_evict() as it evicts * space for upcoming writes. */ mutex_enter(&vd->vdev_trim_lock); if (vdev_writeable(vd) && !vd->vdev_isl2cache && vd->vdev_trim_thread == NULL && vd->vdev_trim_state == VDEV_TRIM_ACTIVE) { (void) vdev_trim(vd, vd->vdev_trim_rate, vd->vdev_trim_partial, vd->vdev_trim_secure); } mutex_exit(&vd->vdev_trim_lock); if (wasoffline || (oldstate < VDEV_STATE_DEGRADED && vd->vdev_state >= VDEV_STATE_DEGRADED)) spa_event_notify(spa, vd, NULL, ESC_ZFS_VDEV_ONLINE); return (spa_vdev_state_exit(spa, vd, 0)); } static int vdev_offline_locked(spa_t *spa, uint64_t guid, uint64_t flags) { vdev_t *vd, *tvd; int error = 0; uint64_t generation; metaslab_group_t *mg; top: spa_vdev_state_enter(spa, SCL_ALLOC); if ((vd = spa_lookup_by_guid(spa, guid, B_TRUE)) == NULL) return (spa_vdev_state_exit(spa, NULL, SET_ERROR(ENODEV))); if (!vd->vdev_ops->vdev_op_leaf) return (spa_vdev_state_exit(spa, NULL, SET_ERROR(ENOTSUP))); if (vd->vdev_ops == &vdev_draid_spare_ops) return (spa_vdev_state_exit(spa, NULL, ENOTSUP)); tvd = vd->vdev_top; mg = tvd->vdev_mg; generation = spa->spa_config_generation + 1; /* * If the device isn't already offline, try to offline it. */ if (!vd->vdev_offline) { /* * If this device has the only valid copy of some data, * don't allow it to be offlined. Log devices are always * expendable. */ if (!tvd->vdev_islog && vd->vdev_aux == NULL && vdev_dtl_required(vd)) return (spa_vdev_state_exit(spa, NULL, SET_ERROR(EBUSY))); /* * If the top-level is a slog and it has had allocations * then proceed. We check that the vdev's metaslab group * is not NULL since it's possible that we may have just * added this vdev but not yet initialized its metaslabs. */ if (tvd->vdev_islog && mg != NULL) { /* * Prevent any future allocations. */ ASSERT3P(tvd->vdev_log_mg, ==, NULL); metaslab_group_passivate(mg); (void) spa_vdev_state_exit(spa, vd, 0); error = spa_reset_logs(spa); /* * If the log device was successfully reset but has * checkpointed data, do not offline it. */ if (error == 0 && tvd->vdev_checkpoint_sm != NULL) { ASSERT3U(space_map_allocated( tvd->vdev_checkpoint_sm), !=, 0); error = ZFS_ERR_CHECKPOINT_EXISTS; } spa_vdev_state_enter(spa, SCL_ALLOC); /* * Check to see if the config has changed. */ if (error || generation != spa->spa_config_generation) { metaslab_group_activate(mg); if (error) return (spa_vdev_state_exit(spa, vd, error)); (void) spa_vdev_state_exit(spa, vd, 0); goto top; } ASSERT0(tvd->vdev_stat.vs_alloc); } /* * Offline this device and reopen its top-level vdev. * If the top-level vdev is a log device then just offline * it. Otherwise, if this action results in the top-level * vdev becoming unusable, undo it and fail the request. */ vd->vdev_offline = B_TRUE; vdev_reopen(tvd); if (!tvd->vdev_islog && vd->vdev_aux == NULL && vdev_is_dead(tvd)) { vd->vdev_offline = B_FALSE; vdev_reopen(tvd); return (spa_vdev_state_exit(spa, NULL, SET_ERROR(EBUSY))); } /* * Add the device back into the metaslab rotor so that * once we online the device it's open for business. */ if (tvd->vdev_islog && mg != NULL) metaslab_group_activate(mg); } vd->vdev_tmpoffline = !!(flags & ZFS_OFFLINE_TEMPORARY); return (spa_vdev_state_exit(spa, vd, 0)); } int vdev_offline(spa_t *spa, uint64_t guid, uint64_t flags) { int error; mutex_enter(&spa->spa_vdev_top_lock); error = vdev_offline_locked(spa, guid, flags); mutex_exit(&spa->spa_vdev_top_lock); return (error); } /* * Clear the error counts associated with this vdev. Unlike vdev_online() and * vdev_offline(), we assume the spa config is locked. We also clear all * children. If 'vd' is NULL, then the user wants to clear all vdevs. */ void vdev_clear(spa_t *spa, vdev_t *vd) { vdev_t *rvd = spa->spa_root_vdev; ASSERT(spa_config_held(spa, SCL_STATE_ALL, RW_WRITER) == SCL_STATE_ALL); if (vd == NULL) vd = rvd; vd->vdev_stat.vs_read_errors = 0; vd->vdev_stat.vs_write_errors = 0; vd->vdev_stat.vs_checksum_errors = 0; vd->vdev_stat.vs_slow_ios = 0; for (int c = 0; c < vd->vdev_children; c++) vdev_clear(spa, vd->vdev_child[c]); /* * It makes no sense to "clear" an indirect vdev. */ if (!vdev_is_concrete(vd)) return; /* * If we're in the FAULTED state or have experienced failed I/O, then * clear the persistent state and attempt to reopen the device. We * also mark the vdev config dirty, so that the new faulted state is * written out to disk. */ if (vd->vdev_faulted || vd->vdev_degraded || !vdev_readable(vd) || !vdev_writeable(vd)) { /* * When reopening in response to a clear event, it may be due to * a fmadm repair request. In this case, if the device is * still broken, we want to still post the ereport again. */ vd->vdev_forcefault = B_TRUE; vd->vdev_faulted = vd->vdev_degraded = 0ULL; vd->vdev_cant_read = B_FALSE; vd->vdev_cant_write = B_FALSE; vd->vdev_stat.vs_aux = 0; vdev_reopen(vd == rvd ? rvd : vd->vdev_top); vd->vdev_forcefault = B_FALSE; if (vd != rvd && vdev_writeable(vd->vdev_top)) vdev_state_dirty(vd->vdev_top); /* If a resilver isn't required, check if vdevs can be culled */ if (vd->vdev_aux == NULL && !vdev_is_dead(vd) && !dsl_scan_resilvering(spa->spa_dsl_pool) && !dsl_scan_resilver_scheduled(spa->spa_dsl_pool)) spa_async_request(spa, SPA_ASYNC_RESILVER_DONE); spa_event_notify(spa, vd, NULL, ESC_ZFS_VDEV_CLEAR); } /* * When clearing a FMA-diagnosed fault, we always want to * unspare the device, as we assume that the original spare was * done in response to the FMA fault. */ if (!vdev_is_dead(vd) && vd->vdev_parent != NULL && vd->vdev_parent->vdev_ops == &vdev_spare_ops && vd->vdev_parent->vdev_child[0] == vd) vd->vdev_unspare = B_TRUE; /* Clear recent error events cache (i.e. duplicate events tracking) */ zfs_ereport_clear(spa, vd); } boolean_t vdev_is_dead(vdev_t *vd) { /* * Holes and missing devices are always considered "dead". * This simplifies the code since we don't have to check for * these types of devices in the various code paths. * Instead we rely on the fact that we skip over dead devices * before issuing I/O to them. */ return (vd->vdev_state < VDEV_STATE_DEGRADED || vd->vdev_ops == &vdev_hole_ops || vd->vdev_ops == &vdev_missing_ops); } boolean_t vdev_readable(vdev_t *vd) { return (!vdev_is_dead(vd) && !vd->vdev_cant_read); } boolean_t vdev_writeable(vdev_t *vd) { return (!vdev_is_dead(vd) && !vd->vdev_cant_write && vdev_is_concrete(vd)); } boolean_t vdev_allocatable(vdev_t *vd) { uint64_t state = vd->vdev_state; /* * We currently allow allocations from vdevs which may be in the * process of reopening (i.e. VDEV_STATE_CLOSED). If the device * fails to reopen then we'll catch it later when we're holding * the proper locks. Note that we have to get the vdev state * in a local variable because although it changes atomically, * we're asking two separate questions about it. */ return (!(state < VDEV_STATE_DEGRADED && state != VDEV_STATE_CLOSED) && !vd->vdev_cant_write && vdev_is_concrete(vd) && vd->vdev_mg->mg_initialized); } boolean_t vdev_accessible(vdev_t *vd, zio_t *zio) { ASSERT(zio->io_vd == vd); if (vdev_is_dead(vd) || vd->vdev_remove_wanted) return (B_FALSE); if (zio->io_type == ZIO_TYPE_READ) return (!vd->vdev_cant_read); if (zio->io_type == ZIO_TYPE_WRITE) return (!vd->vdev_cant_write); return (B_TRUE); } static void vdev_get_child_stat(vdev_t *cvd, vdev_stat_t *vs, vdev_stat_t *cvs) { /* * Exclude the dRAID spare when aggregating to avoid double counting * the ops and bytes. These IOs are counted by the physical leaves. */ if (cvd->vdev_ops == &vdev_draid_spare_ops) return; for (int t = 0; t < VS_ZIO_TYPES; t++) { vs->vs_ops[t] += cvs->vs_ops[t]; vs->vs_bytes[t] += cvs->vs_bytes[t]; } cvs->vs_scan_removing = cvd->vdev_removing; } /* * Get extended stats */ static void vdev_get_child_stat_ex(vdev_t *cvd, vdev_stat_ex_t *vsx, vdev_stat_ex_t *cvsx) { (void) cvd; int t, b; for (t = 0; t < ZIO_TYPES; t++) { for (b = 0; b < ARRAY_SIZE(vsx->vsx_disk_histo[0]); b++) vsx->vsx_disk_histo[t][b] += cvsx->vsx_disk_histo[t][b]; for (b = 0; b < ARRAY_SIZE(vsx->vsx_total_histo[0]); b++) { vsx->vsx_total_histo[t][b] += cvsx->vsx_total_histo[t][b]; } } for (t = 0; t < ZIO_PRIORITY_NUM_QUEUEABLE; t++) { for (b = 0; b < ARRAY_SIZE(vsx->vsx_queue_histo[0]); b++) { vsx->vsx_queue_histo[t][b] += cvsx->vsx_queue_histo[t][b]; } vsx->vsx_active_queue[t] += cvsx->vsx_active_queue[t]; vsx->vsx_pend_queue[t] += cvsx->vsx_pend_queue[t]; for (b = 0; b < ARRAY_SIZE(vsx->vsx_ind_histo[0]); b++) vsx->vsx_ind_histo[t][b] += cvsx->vsx_ind_histo[t][b]; for (b = 0; b < ARRAY_SIZE(vsx->vsx_agg_histo[0]); b++) vsx->vsx_agg_histo[t][b] += cvsx->vsx_agg_histo[t][b]; } } boolean_t vdev_is_spacemap_addressable(vdev_t *vd) { if (spa_feature_is_active(vd->vdev_spa, SPA_FEATURE_SPACEMAP_V2)) return (B_TRUE); /* * If double-word space map entries are not enabled we assume * 47 bits of the space map entry are dedicated to the entry's * offset (see SM_OFFSET_BITS in space_map.h). We then use that * to calculate the maximum address that can be described by a * space map entry for the given device. */ uint64_t shift = vd->vdev_ashift + SM_OFFSET_BITS; if (shift >= 63) /* detect potential overflow */ return (B_TRUE); return (vd->vdev_asize < (1ULL << shift)); } /* * Get statistics for the given vdev. */ static void vdev_get_stats_ex_impl(vdev_t *vd, vdev_stat_t *vs, vdev_stat_ex_t *vsx) { int t; /* * If we're getting stats on the root vdev, aggregate the I/O counts * over all top-level vdevs (i.e. the direct children of the root). */ if (!vd->vdev_ops->vdev_op_leaf) { if (vs) { memset(vs->vs_ops, 0, sizeof (vs->vs_ops)); memset(vs->vs_bytes, 0, sizeof (vs->vs_bytes)); } if (vsx) memset(vsx, 0, sizeof (*vsx)); for (int c = 0; c < vd->vdev_children; c++) { vdev_t *cvd = vd->vdev_child[c]; vdev_stat_t *cvs = &cvd->vdev_stat; vdev_stat_ex_t *cvsx = &cvd->vdev_stat_ex; vdev_get_stats_ex_impl(cvd, cvs, cvsx); if (vs) vdev_get_child_stat(cvd, vs, cvs); if (vsx) vdev_get_child_stat_ex(cvd, vsx, cvsx); } } else { /* * We're a leaf. Just copy our ZIO active queue stats in. The * other leaf stats are updated in vdev_stat_update(). */ if (!vsx) return; memcpy(vsx, &vd->vdev_stat_ex, sizeof (vd->vdev_stat_ex)); for (t = 0; t < ARRAY_SIZE(vd->vdev_queue.vq_class); t++) { vsx->vsx_active_queue[t] = vd->vdev_queue.vq_class[t].vqc_active; vsx->vsx_pend_queue[t] = avl_numnodes( &vd->vdev_queue.vq_class[t].vqc_queued_tree); } } } void vdev_get_stats_ex(vdev_t *vd, vdev_stat_t *vs, vdev_stat_ex_t *vsx) { vdev_t *tvd = vd->vdev_top; mutex_enter(&vd->vdev_stat_lock); if (vs) { memcpy(vs, &vd->vdev_stat, sizeof (*vs)); vs->vs_timestamp = gethrtime() - vs->vs_timestamp; vs->vs_state = vd->vdev_state; vs->vs_rsize = vdev_get_min_asize(vd); if (vd->vdev_ops->vdev_op_leaf) { vs->vs_pspace = vd->vdev_psize; vs->vs_rsize += VDEV_LABEL_START_SIZE + VDEV_LABEL_END_SIZE; /* * Report initializing progress. Since we don't * have the initializing locks held, this is only * an estimate (although a fairly accurate one). */ vs->vs_initialize_bytes_done = vd->vdev_initialize_bytes_done; vs->vs_initialize_bytes_est = vd->vdev_initialize_bytes_est; vs->vs_initialize_state = vd->vdev_initialize_state; vs->vs_initialize_action_time = vd->vdev_initialize_action_time; /* * Report manual TRIM progress. Since we don't have * the manual TRIM locks held, this is only an * estimate (although fairly accurate one). */ vs->vs_trim_notsup = !vd->vdev_has_trim; vs->vs_trim_bytes_done = vd->vdev_trim_bytes_done; vs->vs_trim_bytes_est = vd->vdev_trim_bytes_est; vs->vs_trim_state = vd->vdev_trim_state; vs->vs_trim_action_time = vd->vdev_trim_action_time; /* Set when there is a deferred resilver. */ vs->vs_resilver_deferred = vd->vdev_resilver_deferred; } /* * Report expandable space on top-level, non-auxiliary devices * only. The expandable space is reported in terms of metaslab * sized units since that determines how much space the pool * can expand. */ if (vd->vdev_aux == NULL && tvd != NULL) { vs->vs_esize = P2ALIGN( vd->vdev_max_asize - vd->vdev_asize, 1ULL << tvd->vdev_ms_shift); } vs->vs_configured_ashift = vd->vdev_top != NULL ? vd->vdev_top->vdev_ashift : vd->vdev_ashift; vs->vs_logical_ashift = vd->vdev_logical_ashift; if (vd->vdev_physical_ashift <= ASHIFT_MAX) vs->vs_physical_ashift = vd->vdev_physical_ashift; else vs->vs_physical_ashift = 0; /* * Report fragmentation and rebuild progress for top-level, * non-auxiliary, concrete devices. */ if (vd->vdev_aux == NULL && vd == vd->vdev_top && vdev_is_concrete(vd)) { /* * The vdev fragmentation rating doesn't take into * account the embedded slog metaslab (vdev_log_mg). * Since it's only one metaslab, it would have a tiny * impact on the overall fragmentation. */ vs->vs_fragmentation = (vd->vdev_mg != NULL) ? vd->vdev_mg->mg_fragmentation : 0; } vs->vs_noalloc = MAX(vd->vdev_noalloc, tvd ? tvd->vdev_noalloc : 0); } vdev_get_stats_ex_impl(vd, vs, vsx); mutex_exit(&vd->vdev_stat_lock); } void vdev_get_stats(vdev_t *vd, vdev_stat_t *vs) { return (vdev_get_stats_ex(vd, vs, NULL)); } void vdev_clear_stats(vdev_t *vd) { mutex_enter(&vd->vdev_stat_lock); vd->vdev_stat.vs_space = 0; vd->vdev_stat.vs_dspace = 0; vd->vdev_stat.vs_alloc = 0; mutex_exit(&vd->vdev_stat_lock); } void vdev_scan_stat_init(vdev_t *vd) { vdev_stat_t *vs = &vd->vdev_stat; for (int c = 0; c < vd->vdev_children; c++) vdev_scan_stat_init(vd->vdev_child[c]); mutex_enter(&vd->vdev_stat_lock); vs->vs_scan_processed = 0; mutex_exit(&vd->vdev_stat_lock); } void vdev_stat_update(zio_t *zio, uint64_t psize) { spa_t *spa = zio->io_spa; vdev_t *rvd = spa->spa_root_vdev; vdev_t *vd = zio->io_vd ? zio->io_vd : rvd; vdev_t *pvd; uint64_t txg = zio->io_txg; vdev_stat_t *vs = vd ? &vd->vdev_stat : NULL; vdev_stat_ex_t *vsx = vd ? &vd->vdev_stat_ex : NULL; zio_type_t type = zio->io_type; int flags = zio->io_flags; /* * If this i/o is a gang leader, it didn't do any actual work. */ if (zio->io_gang_tree) return; if (zio->io_error == 0) { /* * If this is a root i/o, don't count it -- we've already * counted the top-level vdevs, and vdev_get_stats() will * aggregate them when asked. This reduces contention on * the root vdev_stat_lock and implicitly handles blocks * that compress away to holes, for which there is no i/o. * (Holes never create vdev children, so all the counters * remain zero, which is what we want.) * * Note: this only applies to successful i/o (io_error == 0) * because unlike i/o counts, errors are not additive. * When reading a ditto block, for example, failure of * one top-level vdev does not imply a root-level error. */ if (vd == rvd) return; ASSERT(vd == zio->io_vd); if (flags & ZIO_FLAG_IO_BYPASS) return; mutex_enter(&vd->vdev_stat_lock); if (flags & ZIO_FLAG_IO_REPAIR) { /* * Repair is the result of a resilver issued by the * scan thread (spa_sync). */ if (flags & ZIO_FLAG_SCAN_THREAD) { dsl_scan_t *scn = spa->spa_dsl_pool->dp_scan; dsl_scan_phys_t *scn_phys = &scn->scn_phys; uint64_t *processed = &scn_phys->scn_processed; if (vd->vdev_ops->vdev_op_leaf) atomic_add_64(processed, psize); vs->vs_scan_processed += psize; } /* * Repair is the result of a rebuild issued by the * rebuild thread (vdev_rebuild_thread). To avoid * double counting repaired bytes the virtual dRAID * spare vdev is excluded from the processed bytes. */ if (zio->io_priority == ZIO_PRIORITY_REBUILD) { vdev_t *tvd = vd->vdev_top; vdev_rebuild_t *vr = &tvd->vdev_rebuild_config; vdev_rebuild_phys_t *vrp = &vr->vr_rebuild_phys; uint64_t *rebuilt = &vrp->vrp_bytes_rebuilt; if (vd->vdev_ops->vdev_op_leaf && vd->vdev_ops != &vdev_draid_spare_ops) { atomic_add_64(rebuilt, psize); } vs->vs_rebuild_processed += psize; } if (flags & ZIO_FLAG_SELF_HEAL) vs->vs_self_healed += psize; } /* * The bytes/ops/histograms are recorded at the leaf level and * aggregated into the higher level vdevs in vdev_get_stats(). */ if (vd->vdev_ops->vdev_op_leaf && (zio->io_priority < ZIO_PRIORITY_NUM_QUEUEABLE)) { zio_type_t vs_type = type; zio_priority_t priority = zio->io_priority; /* * TRIM ops and bytes are reported to user space as * ZIO_TYPE_IOCTL. This is done to preserve the * vdev_stat_t structure layout for user space. */ if (type == ZIO_TYPE_TRIM) vs_type = ZIO_TYPE_IOCTL; /* * Solely for the purposes of 'zpool iostat -lqrw' * reporting use the priority to categorize the IO. * Only the following are reported to user space: * * ZIO_PRIORITY_SYNC_READ, * ZIO_PRIORITY_SYNC_WRITE, * ZIO_PRIORITY_ASYNC_READ, * ZIO_PRIORITY_ASYNC_WRITE, * ZIO_PRIORITY_SCRUB, * ZIO_PRIORITY_TRIM, * ZIO_PRIORITY_REBUILD. */ if (priority == ZIO_PRIORITY_INITIALIZING) { ASSERT3U(type, ==, ZIO_TYPE_WRITE); priority = ZIO_PRIORITY_ASYNC_WRITE; } else if (priority == ZIO_PRIORITY_REMOVAL) { priority = ((type == ZIO_TYPE_WRITE) ? ZIO_PRIORITY_ASYNC_WRITE : ZIO_PRIORITY_ASYNC_READ); } vs->vs_ops[vs_type]++; vs->vs_bytes[vs_type] += psize; if (flags & ZIO_FLAG_DELEGATED) { vsx->vsx_agg_histo[priority] [RQ_HISTO(zio->io_size)]++; } else { vsx->vsx_ind_histo[priority] [RQ_HISTO(zio->io_size)]++; } if (zio->io_delta && zio->io_delay) { vsx->vsx_queue_histo[priority] [L_HISTO(zio->io_delta - zio->io_delay)]++; vsx->vsx_disk_histo[type] [L_HISTO(zio->io_delay)]++; vsx->vsx_total_histo[type] [L_HISTO(zio->io_delta)]++; } } mutex_exit(&vd->vdev_stat_lock); return; } if (flags & ZIO_FLAG_SPECULATIVE) return; /* * If this is an I/O error that is going to be retried, then ignore the * error. Otherwise, the user may interpret B_FAILFAST I/O errors as * hard errors, when in reality they can happen for any number of * innocuous reasons (bus resets, MPxIO link failure, etc). */ if (zio->io_error == EIO && !(zio->io_flags & ZIO_FLAG_IO_RETRY)) return; /* * Intent logs writes won't propagate their error to the root * I/O so don't mark these types of failures as pool-level * errors. */ if (zio->io_vd == NULL && (zio->io_flags & ZIO_FLAG_DONT_PROPAGATE)) return; if (type == ZIO_TYPE_WRITE && txg != 0 && (!(flags & ZIO_FLAG_IO_REPAIR) || (flags & ZIO_FLAG_SCAN_THREAD) || spa->spa_claiming)) { /* * This is either a normal write (not a repair), or it's * a repair induced by the scrub thread, or it's a repair * made by zil_claim() during spa_load() in the first txg. * In the normal case, we commit the DTL change in the same * txg as the block was born. In the scrub-induced repair * case, we know that scrubs run in first-pass syncing context, * so we commit the DTL change in spa_syncing_txg(spa). * In the zil_claim() case, we commit in spa_first_txg(spa). * * We currently do not make DTL entries for failed spontaneous * self-healing writes triggered by normal (non-scrubbing) * reads, because we have no transactional context in which to * do so -- and it's not clear that it'd be desirable anyway. */ if (vd->vdev_ops->vdev_op_leaf) { uint64_t commit_txg = txg; if (flags & ZIO_FLAG_SCAN_THREAD) { ASSERT(flags & ZIO_FLAG_IO_REPAIR); ASSERT(spa_sync_pass(spa) == 1); vdev_dtl_dirty(vd, DTL_SCRUB, txg, 1); commit_txg = spa_syncing_txg(spa); } else if (spa->spa_claiming) { ASSERT(flags & ZIO_FLAG_IO_REPAIR); commit_txg = spa_first_txg(spa); } ASSERT(commit_txg >= spa_syncing_txg(spa)); if (vdev_dtl_contains(vd, DTL_MISSING, txg, 1)) return; for (pvd = vd; pvd != rvd; pvd = pvd->vdev_parent) vdev_dtl_dirty(pvd, DTL_PARTIAL, txg, 1); vdev_dirty(vd->vdev_top, VDD_DTL, vd, commit_txg); } if (vd != rvd) vdev_dtl_dirty(vd, DTL_MISSING, txg, 1); } } int64_t vdev_deflated_space(vdev_t *vd, int64_t space) { ASSERT((space & (SPA_MINBLOCKSIZE-1)) == 0); ASSERT(vd->vdev_deflate_ratio != 0 || vd->vdev_isl2cache); return ((space >> SPA_MINBLOCKSHIFT) * vd->vdev_deflate_ratio); } /* * Update the in-core space usage stats for this vdev, its metaslab class, * and the root vdev. */ void vdev_space_update(vdev_t *vd, int64_t alloc_delta, int64_t defer_delta, int64_t space_delta) { (void) defer_delta; int64_t dspace_delta; spa_t *spa = vd->vdev_spa; vdev_t *rvd = spa->spa_root_vdev; ASSERT(vd == vd->vdev_top); /* * Apply the inverse of the psize-to-asize (ie. RAID-Z) space-expansion * factor. We must calculate this here and not at the root vdev * because the root vdev's psize-to-asize is simply the max of its * children's, thus not accurate enough for us. */ dspace_delta = vdev_deflated_space(vd, space_delta); mutex_enter(&vd->vdev_stat_lock); /* ensure we won't underflow */ if (alloc_delta < 0) { ASSERT3U(vd->vdev_stat.vs_alloc, >=, -alloc_delta); } vd->vdev_stat.vs_alloc += alloc_delta; vd->vdev_stat.vs_space += space_delta; vd->vdev_stat.vs_dspace += dspace_delta; mutex_exit(&vd->vdev_stat_lock); /* every class but log contributes to root space stats */ if (vd->vdev_mg != NULL && !vd->vdev_islog) { ASSERT(!vd->vdev_isl2cache); mutex_enter(&rvd->vdev_stat_lock); rvd->vdev_stat.vs_alloc += alloc_delta; rvd->vdev_stat.vs_space += space_delta; rvd->vdev_stat.vs_dspace += dspace_delta; mutex_exit(&rvd->vdev_stat_lock); } /* Note: metaslab_class_space_update moved to metaslab_space_update */ } /* * Mark a top-level vdev's config as dirty, placing it on the dirty list * so that it will be written out next time the vdev configuration is synced. * If the root vdev is specified (vdev_top == NULL), dirty all top-level vdevs. */ void vdev_config_dirty(vdev_t *vd) { spa_t *spa = vd->vdev_spa; vdev_t *rvd = spa->spa_root_vdev; int c; ASSERT(spa_writeable(spa)); /* * If this is an aux vdev (as with l2cache and spare devices), then we * update the vdev config manually and set the sync flag. */ if (vd->vdev_aux != NULL) { spa_aux_vdev_t *sav = vd->vdev_aux; nvlist_t **aux; uint_t naux; for (c = 0; c < sav->sav_count; c++) { if (sav->sav_vdevs[c] == vd) break; } if (c == sav->sav_count) { /* * We're being removed. There's nothing more to do. */ ASSERT(sav->sav_sync == B_TRUE); return; } sav->sav_sync = B_TRUE; if (nvlist_lookup_nvlist_array(sav->sav_config, ZPOOL_CONFIG_L2CACHE, &aux, &naux) != 0) { VERIFY(nvlist_lookup_nvlist_array(sav->sav_config, ZPOOL_CONFIG_SPARES, &aux, &naux) == 0); } ASSERT(c < naux); /* * Setting the nvlist in the middle if the array is a little * sketchy, but it will work. */ nvlist_free(aux[c]); aux[c] = vdev_config_generate(spa, vd, B_TRUE, 0); return; } /* * The dirty list is protected by the SCL_CONFIG lock. The caller * must either hold SCL_CONFIG as writer, or must be the sync thread * (which holds SCL_CONFIG as reader). There's only one sync thread, * so this is sufficient to ensure mutual exclusion. */ ASSERT(spa_config_held(spa, SCL_CONFIG, RW_WRITER) || (dsl_pool_sync_context(spa_get_dsl(spa)) && spa_config_held(spa, SCL_CONFIG, RW_READER))); if (vd == rvd) { for (c = 0; c < rvd->vdev_children; c++) vdev_config_dirty(rvd->vdev_child[c]); } else { ASSERT(vd == vd->vdev_top); if (!list_link_active(&vd->vdev_config_dirty_node) && vdev_is_concrete(vd)) { list_insert_head(&spa->spa_config_dirty_list, vd); } } } void vdev_config_clean(vdev_t *vd) { spa_t *spa = vd->vdev_spa; ASSERT(spa_config_held(spa, SCL_CONFIG, RW_WRITER) || (dsl_pool_sync_context(spa_get_dsl(spa)) && spa_config_held(spa, SCL_CONFIG, RW_READER))); ASSERT(list_link_active(&vd->vdev_config_dirty_node)); list_remove(&spa->spa_config_dirty_list, vd); } /* * Mark a top-level vdev's state as dirty, so that the next pass of * spa_sync() can convert this into vdev_config_dirty(). We distinguish * the state changes from larger config changes because they require * much less locking, and are often needed for administrative actions. */ void vdev_state_dirty(vdev_t *vd) { spa_t *spa = vd->vdev_spa; ASSERT(spa_writeable(spa)); ASSERT(vd == vd->vdev_top); /* * The state list is protected by the SCL_STATE lock. The caller * must either hold SCL_STATE as writer, or must be the sync thread * (which holds SCL_STATE as reader). There's only one sync thread, * so this is sufficient to ensure mutual exclusion. */ ASSERT(spa_config_held(spa, SCL_STATE, RW_WRITER) || (dsl_pool_sync_context(spa_get_dsl(spa)) && spa_config_held(spa, SCL_STATE, RW_READER))); if (!list_link_active(&vd->vdev_state_dirty_node) && vdev_is_concrete(vd)) list_insert_head(&spa->spa_state_dirty_list, vd); } void vdev_state_clean(vdev_t *vd) { spa_t *spa = vd->vdev_spa; ASSERT(spa_config_held(spa, SCL_STATE, RW_WRITER) || (dsl_pool_sync_context(spa_get_dsl(spa)) && spa_config_held(spa, SCL_STATE, RW_READER))); ASSERT(list_link_active(&vd->vdev_state_dirty_node)); list_remove(&spa->spa_state_dirty_list, vd); } /* * Propagate vdev state up from children to parent. */ void vdev_propagate_state(vdev_t *vd) { spa_t *spa = vd->vdev_spa; vdev_t *rvd = spa->spa_root_vdev; int degraded = 0, faulted = 0; int corrupted = 0; vdev_t *child; if (vd->vdev_children > 0) { for (int c = 0; c < vd->vdev_children; c++) { child = vd->vdev_child[c]; /* * Don't factor holes or indirect vdevs into the * decision. */ if (!vdev_is_concrete(child)) continue; if (!vdev_readable(child) || (!vdev_writeable(child) && spa_writeable(spa))) { /* * Root special: if there is a top-level log * device, treat the root vdev as if it were * degraded. */ if (child->vdev_islog && vd == rvd) degraded++; else faulted++; } else if (child->vdev_state <= VDEV_STATE_DEGRADED) { degraded++; } if (child->vdev_stat.vs_aux == VDEV_AUX_CORRUPT_DATA) corrupted++; } vd->vdev_ops->vdev_op_state_change(vd, faulted, degraded); /* * Root special: if there is a top-level vdev that cannot be * opened due to corrupted metadata, then propagate the root * vdev's aux state as 'corrupt' rather than 'insufficient * replicas'. */ if (corrupted && vd == rvd && rvd->vdev_state == VDEV_STATE_CANT_OPEN) vdev_set_state(rvd, B_FALSE, VDEV_STATE_CANT_OPEN, VDEV_AUX_CORRUPT_DATA); } if (vd->vdev_parent) vdev_propagate_state(vd->vdev_parent); } /* * Set a vdev's state. If this is during an open, we don't update the parent * state, because we're in the process of opening children depth-first. * Otherwise, we propagate the change to the parent. * * If this routine places a device in a faulted state, an appropriate ereport is * generated. */ void vdev_set_state(vdev_t *vd, boolean_t isopen, vdev_state_t state, vdev_aux_t aux) { uint64_t save_state; spa_t *spa = vd->vdev_spa; if (state == vd->vdev_state) { /* * Since vdev_offline() code path is already in an offline * state we can miss a statechange event to OFFLINE. Check * the previous state to catch this condition. */ if (vd->vdev_ops->vdev_op_leaf && (state == VDEV_STATE_OFFLINE) && (vd->vdev_prevstate >= VDEV_STATE_FAULTED)) { /* post an offline state change */ zfs_post_state_change(spa, vd, vd->vdev_prevstate); } vd->vdev_stat.vs_aux = aux; return; } save_state = vd->vdev_state; vd->vdev_state = state; vd->vdev_stat.vs_aux = aux; /* * If we are setting the vdev state to anything but an open state, then * always close the underlying device unless the device has requested * a delayed close (i.e. we're about to remove or fault the device). * Otherwise, we keep accessible but invalid devices open forever. * We don't call vdev_close() itself, because that implies some extra * checks (offline, etc) that we don't want here. This is limited to * leaf devices, because otherwise closing the device will affect other * children. */ if (!vd->vdev_delayed_close && vdev_is_dead(vd) && vd->vdev_ops->vdev_op_leaf) vd->vdev_ops->vdev_op_close(vd); if (vd->vdev_removed && state == VDEV_STATE_CANT_OPEN && (aux == VDEV_AUX_OPEN_FAILED || vd->vdev_checkremove)) { /* * If the previous state is set to VDEV_STATE_REMOVED, then this * device was previously marked removed and someone attempted to * reopen it. If this failed due to a nonexistent device, then * keep the device in the REMOVED state. We also let this be if * it is one of our special test online cases, which is only * attempting to online the device and shouldn't generate an FMA * fault. */ vd->vdev_state = VDEV_STATE_REMOVED; vd->vdev_stat.vs_aux = VDEV_AUX_NONE; } else if (state == VDEV_STATE_REMOVED) { vd->vdev_removed = B_TRUE; } else if (state == VDEV_STATE_CANT_OPEN) { /* * If we fail to open a vdev during an import or recovery, we * mark it as "not available", which signifies that it was * never there to begin with. Failure to open such a device * is not considered an error. */ if ((spa_load_state(spa) == SPA_LOAD_IMPORT || spa_load_state(spa) == SPA_LOAD_RECOVER) && vd->vdev_ops->vdev_op_leaf) vd->vdev_not_present = 1; /* * Post the appropriate ereport. If the 'prevstate' field is * set to something other than VDEV_STATE_UNKNOWN, it indicates * that this is part of a vdev_reopen(). In this case, we don't * want to post the ereport if the device was already in the * CANT_OPEN state beforehand. * * If the 'checkremove' flag is set, then this is an attempt to * online the device in response to an insertion event. If we * hit this case, then we have detected an insertion event for a * faulted or offline device that wasn't in the removed state. * In this scenario, we don't post an ereport because we are * about to replace the device, or attempt an online with * vdev_forcefault, which will generate the fault for us. */ if ((vd->vdev_prevstate != state || vd->vdev_forcefault) && !vd->vdev_not_present && !vd->vdev_checkremove && vd != spa->spa_root_vdev) { const char *class; switch (aux) { case VDEV_AUX_OPEN_FAILED: class = FM_EREPORT_ZFS_DEVICE_OPEN_FAILED; break; case VDEV_AUX_CORRUPT_DATA: class = FM_EREPORT_ZFS_DEVICE_CORRUPT_DATA; break; case VDEV_AUX_NO_REPLICAS: class = FM_EREPORT_ZFS_DEVICE_NO_REPLICAS; break; case VDEV_AUX_BAD_GUID_SUM: class = FM_EREPORT_ZFS_DEVICE_BAD_GUID_SUM; break; case VDEV_AUX_TOO_SMALL: class = FM_EREPORT_ZFS_DEVICE_TOO_SMALL; break; case VDEV_AUX_BAD_LABEL: class = FM_EREPORT_ZFS_DEVICE_BAD_LABEL; break; case VDEV_AUX_BAD_ASHIFT: class = FM_EREPORT_ZFS_DEVICE_BAD_ASHIFT; break; default: class = FM_EREPORT_ZFS_DEVICE_UNKNOWN; } (void) zfs_ereport_post(class, spa, vd, NULL, NULL, save_state); } /* Erase any notion of persistent removed state */ vd->vdev_removed = B_FALSE; } else { vd->vdev_removed = B_FALSE; } /* * Notify ZED of any significant state-change on a leaf vdev. * */ if (vd->vdev_ops->vdev_op_leaf) { /* preserve original state from a vdev_reopen() */ if ((vd->vdev_prevstate != VDEV_STATE_UNKNOWN) && (vd->vdev_prevstate != vd->vdev_state) && (save_state <= VDEV_STATE_CLOSED)) save_state = vd->vdev_prevstate; /* filter out state change due to initial vdev_open */ if (save_state > VDEV_STATE_CLOSED) zfs_post_state_change(spa, vd, save_state); } if (!isopen && vd->vdev_parent) vdev_propagate_state(vd->vdev_parent); } boolean_t vdev_children_are_offline(vdev_t *vd) { ASSERT(!vd->vdev_ops->vdev_op_leaf); for (uint64_t i = 0; i < vd->vdev_children; i++) { if (vd->vdev_child[i]->vdev_state != VDEV_STATE_OFFLINE) return (B_FALSE); } return (B_TRUE); } /* * Check the vdev configuration to ensure that it's capable of supporting * a root pool. We do not support partial configuration. */ boolean_t vdev_is_bootable(vdev_t *vd) { if (!vd->vdev_ops->vdev_op_leaf) { const char *vdev_type = vd->vdev_ops->vdev_op_type; if (strcmp(vdev_type, VDEV_TYPE_MISSING) == 0) return (B_FALSE); } for (int c = 0; c < vd->vdev_children; c++) { if (!vdev_is_bootable(vd->vdev_child[c])) return (B_FALSE); } return (B_TRUE); } boolean_t vdev_is_concrete(vdev_t *vd) { vdev_ops_t *ops = vd->vdev_ops; if (ops == &vdev_indirect_ops || ops == &vdev_hole_ops || ops == &vdev_missing_ops || ops == &vdev_root_ops) { return (B_FALSE); } else { return (B_TRUE); } } /* * Determine if a log device has valid content. If the vdev was * removed or faulted in the MOS config then we know that * the content on the log device has already been written to the pool. */ boolean_t vdev_log_state_valid(vdev_t *vd) { if (vd->vdev_ops->vdev_op_leaf && !vd->vdev_faulted && !vd->vdev_removed) return (B_TRUE); for (int c = 0; c < vd->vdev_children; c++) if (vdev_log_state_valid(vd->vdev_child[c])) return (B_TRUE); return (B_FALSE); } /* * Expand a vdev if possible. */ void vdev_expand(vdev_t *vd, uint64_t txg) { ASSERT(vd->vdev_top == vd); ASSERT(spa_config_held(vd->vdev_spa, SCL_ALL, RW_WRITER) == SCL_ALL); ASSERT(vdev_is_concrete(vd)); vdev_set_deflate_ratio(vd); if ((vd->vdev_asize >> vd->vdev_ms_shift) > vd->vdev_ms_count && vdev_is_concrete(vd)) { vdev_metaslab_group_create(vd); VERIFY(vdev_metaslab_init(vd, txg) == 0); vdev_config_dirty(vd); } } /* * Split a vdev. */ void vdev_split(vdev_t *vd) { vdev_t *cvd, *pvd = vd->vdev_parent; vdev_remove_child(pvd, vd); vdev_compact_children(pvd); cvd = pvd->vdev_child[0]; if (pvd->vdev_children == 1) { vdev_remove_parent(cvd); cvd->vdev_splitting = B_TRUE; } vdev_propagate_state(cvd); } void vdev_deadman(vdev_t *vd, const char *tag) { for (int c = 0; c < vd->vdev_children; c++) { vdev_t *cvd = vd->vdev_child[c]; vdev_deadman(cvd, tag); } if (vd->vdev_ops->vdev_op_leaf) { vdev_queue_t *vq = &vd->vdev_queue; mutex_enter(&vq->vq_lock); if (avl_numnodes(&vq->vq_active_tree) > 0) { spa_t *spa = vd->vdev_spa; zio_t *fio; uint64_t delta; zfs_dbgmsg("slow vdev: %s has %lu active IOs", vd->vdev_path, avl_numnodes(&vq->vq_active_tree)); /* * Look at the head of all the pending queues, * if any I/O has been outstanding for longer than * the spa_deadman_synctime invoke the deadman logic. */ fio = avl_first(&vq->vq_active_tree); delta = gethrtime() - fio->io_timestamp; if (delta > spa_deadman_synctime(spa)) zio_deadman(fio, tag); } mutex_exit(&vq->vq_lock); } } void vdev_defer_resilver(vdev_t *vd) { ASSERT(vd->vdev_ops->vdev_op_leaf); vd->vdev_resilver_deferred = B_TRUE; vd->vdev_spa->spa_resilver_deferred = B_TRUE; } /* * Clears the resilver deferred flag on all leaf devs under vd. Returns * B_TRUE if we have devices that need to be resilvered and are available to * accept resilver I/Os. */ boolean_t vdev_clear_resilver_deferred(vdev_t *vd, dmu_tx_t *tx) { boolean_t resilver_needed = B_FALSE; spa_t *spa = vd->vdev_spa; for (int c = 0; c < vd->vdev_children; c++) { vdev_t *cvd = vd->vdev_child[c]; resilver_needed |= vdev_clear_resilver_deferred(cvd, tx); } if (vd == spa->spa_root_vdev && spa_feature_is_active(spa, SPA_FEATURE_RESILVER_DEFER)) { spa_feature_decr(spa, SPA_FEATURE_RESILVER_DEFER, tx); vdev_config_dirty(vd); spa->spa_resilver_deferred = B_FALSE; return (resilver_needed); } if (!vdev_is_concrete(vd) || vd->vdev_aux || !vd->vdev_ops->vdev_op_leaf) return (resilver_needed); vd->vdev_resilver_deferred = B_FALSE; return (!vdev_is_dead(vd) && !vd->vdev_offline && vdev_resilver_needed(vd, NULL, NULL)); } boolean_t vdev_xlate_is_empty(range_seg64_t *rs) { return (rs->rs_start == rs->rs_end); } /* * Translate a logical range to the first contiguous physical range for the * specified vdev_t. This function is initially called with a leaf vdev and * will walk each parent vdev until it reaches a top-level vdev. Once the * top-level is reached the physical range is initialized and the recursive * function begins to unwind. As it unwinds it calls the parent's vdev * specific translation function to do the real conversion. */ void vdev_xlate(vdev_t *vd, const range_seg64_t *logical_rs, range_seg64_t *physical_rs, range_seg64_t *remain_rs) { /* * Walk up the vdev tree */ if (vd != vd->vdev_top) { vdev_xlate(vd->vdev_parent, logical_rs, physical_rs, remain_rs); } else { /* * We've reached the top-level vdev, initialize the physical * range to the logical range and set an empty remaining * range then start to unwind. */ physical_rs->rs_start = logical_rs->rs_start; physical_rs->rs_end = logical_rs->rs_end; remain_rs->rs_start = logical_rs->rs_start; remain_rs->rs_end = logical_rs->rs_start; return; } vdev_t *pvd = vd->vdev_parent; ASSERT3P(pvd, !=, NULL); ASSERT3P(pvd->vdev_ops->vdev_op_xlate, !=, NULL); /* * As this recursive function unwinds, translate the logical * range into its physical and any remaining components by calling * the vdev specific translate function. */ range_seg64_t intermediate = { 0 }; pvd->vdev_ops->vdev_op_xlate(vd, physical_rs, &intermediate, remain_rs); physical_rs->rs_start = intermediate.rs_start; physical_rs->rs_end = intermediate.rs_end; } void vdev_xlate_walk(vdev_t *vd, const range_seg64_t *logical_rs, vdev_xlate_func_t *func, void *arg) { range_seg64_t iter_rs = *logical_rs; range_seg64_t physical_rs; range_seg64_t remain_rs; while (!vdev_xlate_is_empty(&iter_rs)) { vdev_xlate(vd, &iter_rs, &physical_rs, &remain_rs); /* * With raidz and dRAID, it's possible that the logical range * does not live on this leaf vdev. Only when there is a non- * zero physical size call the provided function. */ if (!vdev_xlate_is_empty(&physical_rs)) func(arg, &physical_rs); iter_rs = remain_rs; } } static char * vdev_name(vdev_t *vd, char *buf, int buflen) { if (vd->vdev_path == NULL) { if (strcmp(vd->vdev_ops->vdev_op_type, "root") == 0) { strlcpy(buf, vd->vdev_spa->spa_name, buflen); } else if (!vd->vdev_ops->vdev_op_leaf) { snprintf(buf, buflen, "%s-%llu", vd->vdev_ops->vdev_op_type, (u_longlong_t)vd->vdev_id); } } else { strlcpy(buf, vd->vdev_path, buflen); } return (buf); } /* * Look at the vdev tree and determine whether any devices are currently being * replaced. */ boolean_t vdev_replace_in_progress(vdev_t *vdev) { ASSERT(spa_config_held(vdev->vdev_spa, SCL_ALL, RW_READER) != 0); if (vdev->vdev_ops == &vdev_replacing_ops) return (B_TRUE); /* * A 'spare' vdev indicates that we have a replace in progress, unless * it has exactly two children, and the second, the hot spare, has * finished being resilvered. */ if (vdev->vdev_ops == &vdev_spare_ops && (vdev->vdev_children > 2 || !vdev_dtl_empty(vdev->vdev_child[1], DTL_MISSING))) return (B_TRUE); for (int i = 0; i < vdev->vdev_children; i++) { if (vdev_replace_in_progress(vdev->vdev_child[i])) return (B_TRUE); } return (B_FALSE); } /* * Add a (source=src, propname=propval) list to an nvlist. */ static void vdev_prop_add_list(nvlist_t *nvl, const char *propname, char *strval, uint64_t intval, zprop_source_t src) { nvlist_t *propval; propval = fnvlist_alloc(); fnvlist_add_uint64(propval, ZPROP_SOURCE, src); if (strval != NULL) fnvlist_add_string(propval, ZPROP_VALUE, strval); else fnvlist_add_uint64(propval, ZPROP_VALUE, intval); fnvlist_add_nvlist(nvl, propname, propval); nvlist_free(propval); } static void vdev_props_set_sync(void *arg, dmu_tx_t *tx) { vdev_t *vd; nvlist_t *nvp = arg; spa_t *spa = dmu_tx_pool(tx)->dp_spa; objset_t *mos = spa->spa_meta_objset; nvpair_t *elem = NULL; uint64_t vdev_guid; nvlist_t *nvprops; vdev_guid = fnvlist_lookup_uint64(nvp, ZPOOL_VDEV_PROPS_SET_VDEV); nvprops = fnvlist_lookup_nvlist(nvp, ZPOOL_VDEV_PROPS_SET_PROPS); vd = spa_lookup_by_guid(spa, vdev_guid, B_TRUE); /* this vdev could get removed while waiting for this sync task */ if (vd == NULL) return; mutex_enter(&spa->spa_props_lock); while ((elem = nvlist_next_nvpair(nvprops, elem)) != NULL) { uint64_t intval, objid = 0; char *strval; vdev_prop_t prop; const char *propname = nvpair_name(elem); zprop_type_t proptype; /* * Set vdev property values in the vdev props mos object. */ if (vd->vdev_top_zap != 0) { objid = vd->vdev_top_zap; } else if (vd->vdev_leaf_zap != 0) { objid = vd->vdev_leaf_zap; } else { panic("vdev not top or leaf"); } switch (prop = vdev_name_to_prop(propname)) { case VDEV_PROP_USERPROP: if (vdev_prop_user(propname)) { strval = fnvpair_value_string(elem); if (strlen(strval) == 0) { /* remove the property if value == "" */ (void) zap_remove(mos, objid, propname, tx); } else { VERIFY0(zap_update(mos, objid, propname, 1, strlen(strval) + 1, strval, tx)); } spa_history_log_internal(spa, "vdev set", tx, "vdev_guid=%llu: %s=%s", (u_longlong_t)vdev_guid, nvpair_name(elem), strval); } break; default: /* normalize the property name */ propname = vdev_prop_to_name(prop); proptype = vdev_prop_get_type(prop); if (nvpair_type(elem) == DATA_TYPE_STRING) { ASSERT(proptype == PROP_TYPE_STRING); strval = fnvpair_value_string(elem); VERIFY0(zap_update(mos, objid, propname, 1, strlen(strval) + 1, strval, tx)); spa_history_log_internal(spa, "vdev set", tx, "vdev_guid=%llu: %s=%s", (u_longlong_t)vdev_guid, nvpair_name(elem), strval); } else if (nvpair_type(elem) == DATA_TYPE_UINT64) { intval = fnvpair_value_uint64(elem); if (proptype == PROP_TYPE_INDEX) { const char *unused; VERIFY0(vdev_prop_index_to_string( prop, intval, &unused)); } VERIFY0(zap_update(mos, objid, propname, sizeof (uint64_t), 1, &intval, tx)); spa_history_log_internal(spa, "vdev set", tx, "vdev_guid=%llu: %s=%lld", (u_longlong_t)vdev_guid, nvpair_name(elem), (longlong_t)intval); } else { panic("invalid vdev property type %u", nvpair_type(elem)); } } } mutex_exit(&spa->spa_props_lock); } int vdev_prop_set(vdev_t *vd, nvlist_t *innvl, nvlist_t *outnvl) { spa_t *spa = vd->vdev_spa; nvpair_t *elem = NULL; uint64_t vdev_guid; nvlist_t *nvprops; int error = 0; ASSERT(vd != NULL); if (nvlist_lookup_uint64(innvl, ZPOOL_VDEV_PROPS_SET_VDEV, &vdev_guid) != 0) return (SET_ERROR(EINVAL)); if (nvlist_lookup_nvlist(innvl, ZPOOL_VDEV_PROPS_SET_PROPS, &nvprops) != 0) return (SET_ERROR(EINVAL)); if ((vd = spa_lookup_by_guid(spa, vdev_guid, B_TRUE)) == NULL) return (SET_ERROR(EINVAL)); while ((elem = nvlist_next_nvpair(nvprops, elem)) != NULL) { char *propname = nvpair_name(elem); vdev_prop_t prop = vdev_name_to_prop(propname); uint64_t intval = 0; char *strval = NULL; if (prop == VDEV_PROP_USERPROP && !vdev_prop_user(propname)) { error = EINVAL; goto end; } if (vdev_prop_readonly(prop)) { error = EROFS; goto end; } /* Special Processing */ switch (prop) { case VDEV_PROP_PATH: if (vd->vdev_path == NULL) { error = EROFS; break; } if (nvpair_value_string(elem, &strval) != 0) { error = EINVAL; break; } /* New path must start with /dev/ */ if (strncmp(strval, "/dev/", 5)) { error = EINVAL; break; } error = spa_vdev_setpath(spa, vdev_guid, strval); break; case VDEV_PROP_ALLOCATING: if (nvpair_value_uint64(elem, &intval) != 0) { error = EINVAL; break; } if (intval != vd->vdev_noalloc) break; if (intval == 0) error = spa_vdev_noalloc(spa, vdev_guid); else error = spa_vdev_alloc(spa, vdev_guid); break; default: /* Most processing is done in vdev_props_set_sync */ break; } end: if (error != 0) { intval = error; vdev_prop_add_list(outnvl, propname, strval, intval, 0); return (error); } } return (dsl_sync_task(spa->spa_name, NULL, vdev_props_set_sync, innvl, 6, ZFS_SPACE_CHECK_EXTRA_RESERVED)); } int vdev_prop_get(vdev_t *vd, nvlist_t *innvl, nvlist_t *outnvl) { spa_t *spa = vd->vdev_spa; objset_t *mos = spa->spa_meta_objset; int err = 0; uint64_t objid; uint64_t vdev_guid; nvpair_t *elem = NULL; nvlist_t *nvprops = NULL; uint64_t intval = 0; char *strval = NULL; const char *propname = NULL; vdev_prop_t prop; ASSERT(vd != NULL); ASSERT(mos != NULL); if (nvlist_lookup_uint64(innvl, ZPOOL_VDEV_PROPS_GET_VDEV, &vdev_guid) != 0) return (SET_ERROR(EINVAL)); nvlist_lookup_nvlist(innvl, ZPOOL_VDEV_PROPS_GET_PROPS, &nvprops); if (vd->vdev_top_zap != 0) { objid = vd->vdev_top_zap; } else if (vd->vdev_leaf_zap != 0) { objid = vd->vdev_leaf_zap; } else { return (SET_ERROR(EINVAL)); } ASSERT(objid != 0); mutex_enter(&spa->spa_props_lock); if (nvprops != NULL) { char namebuf[64] = { 0 }; while ((elem = nvlist_next_nvpair(nvprops, elem)) != NULL) { intval = 0; strval = NULL; propname = nvpair_name(elem); prop = vdev_name_to_prop(propname); zprop_source_t src = ZPROP_SRC_DEFAULT; uint64_t integer_size, num_integers; switch (prop) { /* Special Read-only Properties */ case VDEV_PROP_NAME: strval = vdev_name(vd, namebuf, sizeof (namebuf)); if (strval == NULL) continue; vdev_prop_add_list(outnvl, propname, strval, 0, ZPROP_SRC_NONE); continue; case VDEV_PROP_CAPACITY: /* percent used */ intval = (vd->vdev_stat.vs_dspace == 0) ? 0 : (vd->vdev_stat.vs_alloc * 100 / vd->vdev_stat.vs_dspace); vdev_prop_add_list(outnvl, propname, NULL, intval, ZPROP_SRC_NONE); continue; case VDEV_PROP_STATE: vdev_prop_add_list(outnvl, propname, NULL, vd->vdev_state, ZPROP_SRC_NONE); continue; case VDEV_PROP_GUID: vdev_prop_add_list(outnvl, propname, NULL, vd->vdev_guid, ZPROP_SRC_NONE); continue; case VDEV_PROP_ASIZE: vdev_prop_add_list(outnvl, propname, NULL, vd->vdev_asize, ZPROP_SRC_NONE); continue; case VDEV_PROP_PSIZE: vdev_prop_add_list(outnvl, propname, NULL, vd->vdev_psize, ZPROP_SRC_NONE); continue; case VDEV_PROP_ASHIFT: vdev_prop_add_list(outnvl, propname, NULL, vd->vdev_ashift, ZPROP_SRC_NONE); continue; case VDEV_PROP_SIZE: vdev_prop_add_list(outnvl, propname, NULL, vd->vdev_stat.vs_dspace, ZPROP_SRC_NONE); continue; case VDEV_PROP_FREE: vdev_prop_add_list(outnvl, propname, NULL, vd->vdev_stat.vs_dspace - vd->vdev_stat.vs_alloc, ZPROP_SRC_NONE); continue; case VDEV_PROP_ALLOCATED: vdev_prop_add_list(outnvl, propname, NULL, vd->vdev_stat.vs_alloc, ZPROP_SRC_NONE); continue; case VDEV_PROP_EXPANDSZ: vdev_prop_add_list(outnvl, propname, NULL, vd->vdev_stat.vs_esize, ZPROP_SRC_NONE); continue; case VDEV_PROP_FRAGMENTATION: vdev_prop_add_list(outnvl, propname, NULL, vd->vdev_stat.vs_fragmentation, ZPROP_SRC_NONE); continue; case VDEV_PROP_PARITY: vdev_prop_add_list(outnvl, propname, NULL, vdev_get_nparity(vd), ZPROP_SRC_NONE); continue; case VDEV_PROP_PATH: if (vd->vdev_path == NULL) continue; vdev_prop_add_list(outnvl, propname, vd->vdev_path, 0, ZPROP_SRC_NONE); continue; case VDEV_PROP_DEVID: if (vd->vdev_devid == NULL) continue; vdev_prop_add_list(outnvl, propname, vd->vdev_devid, 0, ZPROP_SRC_NONE); continue; case VDEV_PROP_PHYS_PATH: if (vd->vdev_physpath == NULL) continue; vdev_prop_add_list(outnvl, propname, vd->vdev_physpath, 0, ZPROP_SRC_NONE); continue; case VDEV_PROP_ENC_PATH: if (vd->vdev_enc_sysfs_path == NULL) continue; vdev_prop_add_list(outnvl, propname, vd->vdev_enc_sysfs_path, 0, ZPROP_SRC_NONE); continue; case VDEV_PROP_FRU: if (vd->vdev_fru == NULL) continue; vdev_prop_add_list(outnvl, propname, vd->vdev_fru, 0, ZPROP_SRC_NONE); continue; case VDEV_PROP_PARENT: if (vd->vdev_parent != NULL) { strval = vdev_name(vd->vdev_parent, namebuf, sizeof (namebuf)); vdev_prop_add_list(outnvl, propname, strval, 0, ZPROP_SRC_NONE); } continue; case VDEV_PROP_CHILDREN: if (vd->vdev_children > 0) strval = kmem_zalloc(ZAP_MAXVALUELEN, KM_SLEEP); for (uint64_t i = 0; i < vd->vdev_children; i++) { const char *vname; vname = vdev_name(vd->vdev_child[i], namebuf, sizeof (namebuf)); if (vname == NULL) vname = "(unknown)"; if (strlen(strval) > 0) strlcat(strval, ",", ZAP_MAXVALUELEN); strlcat(strval, vname, ZAP_MAXVALUELEN); } if (strval != NULL) { vdev_prop_add_list(outnvl, propname, strval, 0, ZPROP_SRC_NONE); kmem_free(strval, ZAP_MAXVALUELEN); } continue; case VDEV_PROP_NUMCHILDREN: vdev_prop_add_list(outnvl, propname, NULL, vd->vdev_children, ZPROP_SRC_NONE); continue; case VDEV_PROP_READ_ERRORS: vdev_prop_add_list(outnvl, propname, NULL, vd->vdev_stat.vs_read_errors, ZPROP_SRC_NONE); continue; case VDEV_PROP_WRITE_ERRORS: vdev_prop_add_list(outnvl, propname, NULL, vd->vdev_stat.vs_write_errors, ZPROP_SRC_NONE); continue; case VDEV_PROP_CHECKSUM_ERRORS: vdev_prop_add_list(outnvl, propname, NULL, vd->vdev_stat.vs_checksum_errors, ZPROP_SRC_NONE); continue; case VDEV_PROP_INITIALIZE_ERRORS: vdev_prop_add_list(outnvl, propname, NULL, vd->vdev_stat.vs_initialize_errors, ZPROP_SRC_NONE); continue; case VDEV_PROP_OPS_NULL: vdev_prop_add_list(outnvl, propname, NULL, vd->vdev_stat.vs_ops[ZIO_TYPE_NULL], ZPROP_SRC_NONE); continue; case VDEV_PROP_OPS_READ: vdev_prop_add_list(outnvl, propname, NULL, vd->vdev_stat.vs_ops[ZIO_TYPE_READ], ZPROP_SRC_NONE); continue; case VDEV_PROP_OPS_WRITE: vdev_prop_add_list(outnvl, propname, NULL, vd->vdev_stat.vs_ops[ZIO_TYPE_WRITE], ZPROP_SRC_NONE); continue; case VDEV_PROP_OPS_FREE: vdev_prop_add_list(outnvl, propname, NULL, vd->vdev_stat.vs_ops[ZIO_TYPE_FREE], ZPROP_SRC_NONE); continue; case VDEV_PROP_OPS_CLAIM: vdev_prop_add_list(outnvl, propname, NULL, vd->vdev_stat.vs_ops[ZIO_TYPE_CLAIM], ZPROP_SRC_NONE); continue; case VDEV_PROP_OPS_TRIM: /* * TRIM ops and bytes are reported to user * space as ZIO_TYPE_IOCTL. This is done to * preserve the vdev_stat_t structure layout * for user space. */ vdev_prop_add_list(outnvl, propname, NULL, vd->vdev_stat.vs_ops[ZIO_TYPE_IOCTL], ZPROP_SRC_NONE); continue; case VDEV_PROP_BYTES_NULL: vdev_prop_add_list(outnvl, propname, NULL, vd->vdev_stat.vs_bytes[ZIO_TYPE_NULL], ZPROP_SRC_NONE); continue; case VDEV_PROP_BYTES_READ: vdev_prop_add_list(outnvl, propname, NULL, vd->vdev_stat.vs_bytes[ZIO_TYPE_READ], ZPROP_SRC_NONE); continue; case VDEV_PROP_BYTES_WRITE: vdev_prop_add_list(outnvl, propname, NULL, vd->vdev_stat.vs_bytes[ZIO_TYPE_WRITE], ZPROP_SRC_NONE); continue; case VDEV_PROP_BYTES_FREE: vdev_prop_add_list(outnvl, propname, NULL, vd->vdev_stat.vs_bytes[ZIO_TYPE_FREE], ZPROP_SRC_NONE); continue; case VDEV_PROP_BYTES_CLAIM: vdev_prop_add_list(outnvl, propname, NULL, vd->vdev_stat.vs_bytes[ZIO_TYPE_CLAIM], ZPROP_SRC_NONE); continue; case VDEV_PROP_BYTES_TRIM: /* * TRIM ops and bytes are reported to user * space as ZIO_TYPE_IOCTL. This is done to * preserve the vdev_stat_t structure layout * for user space. */ vdev_prop_add_list(outnvl, propname, NULL, vd->vdev_stat.vs_bytes[ZIO_TYPE_IOCTL], ZPROP_SRC_NONE); continue; case VDEV_PROP_REMOVING: vdev_prop_add_list(outnvl, propname, NULL, vd->vdev_removing, ZPROP_SRC_NONE); continue; /* Numeric Properites */ case VDEV_PROP_ALLOCATING: src = ZPROP_SRC_LOCAL; strval = NULL; err = zap_lookup(mos, objid, nvpair_name(elem), sizeof (uint64_t), 1, &intval); if (err == ENOENT) { intval = vdev_prop_default_numeric(prop); err = 0; } else if (err) break; if (intval == vdev_prop_default_numeric(prop)) src = ZPROP_SRC_DEFAULT; /* Leaf vdevs cannot have this property */ if (vd->vdev_mg == NULL && vd->vdev_top != NULL) { src = ZPROP_SRC_NONE; intval = ZPROP_BOOLEAN_NA; } vdev_prop_add_list(outnvl, propname, strval, intval, src); break; /* Text Properties */ case VDEV_PROP_COMMENT: /* Exists in the ZAP below */ /* FALLTHRU */ case VDEV_PROP_USERPROP: /* User Properites */ src = ZPROP_SRC_LOCAL; err = zap_length(mos, objid, nvpair_name(elem), &integer_size, &num_integers); if (err) break; switch (integer_size) { case 8: /* User properties cannot be integers */ err = EINVAL; break; case 1: /* string property */ strval = kmem_alloc(num_integers, KM_SLEEP); err = zap_lookup(mos, objid, nvpair_name(elem), 1, num_integers, strval); if (err) { kmem_free(strval, num_integers); break; } vdev_prop_add_list(outnvl, propname, strval, 0, src); kmem_free(strval, num_integers); break; } break; default: err = ENOENT; break; } if (err) break; } } else { /* * Get all properties from the MOS vdev property object. */ zap_cursor_t zc; zap_attribute_t za; for (zap_cursor_init(&zc, mos, objid); (err = zap_cursor_retrieve(&zc, &za)) == 0; zap_cursor_advance(&zc)) { intval = 0; strval = NULL; zprop_source_t src = ZPROP_SRC_DEFAULT; propname = za.za_name; - prop = vdev_name_to_prop(propname); switch (za.za_integer_length) { case 8: /* We do not allow integer user properties */ /* This is likely an internal value */ break; case 1: /* string property */ strval = kmem_alloc(za.za_num_integers, KM_SLEEP); err = zap_lookup(mos, objid, za.za_name, 1, za.za_num_integers, strval); if (err) { kmem_free(strval, za.za_num_integers); break; } vdev_prop_add_list(outnvl, propname, strval, 0, src); kmem_free(strval, za.za_num_integers); break; default: break; } } zap_cursor_fini(&zc); } mutex_exit(&spa->spa_props_lock); if (err && err != ENOENT) { return (err); } return (0); } EXPORT_SYMBOL(vdev_fault); EXPORT_SYMBOL(vdev_degrade); EXPORT_SYMBOL(vdev_online); EXPORT_SYMBOL(vdev_offline); EXPORT_SYMBOL(vdev_clear); ZFS_MODULE_PARAM(zfs_vdev, zfs_vdev_, default_ms_count, UINT, ZMOD_RW, "Target number of metaslabs per top-level vdev"); ZFS_MODULE_PARAM(zfs_vdev, zfs_vdev_, default_ms_shift, UINT, ZMOD_RW, "Default limit for metaslab size"); ZFS_MODULE_PARAM(zfs_vdev, zfs_vdev_, min_ms_count, UINT, ZMOD_RW, "Minimum number of metaslabs per top-level vdev"); ZFS_MODULE_PARAM(zfs_vdev, zfs_vdev_, ms_count_limit, UINT, ZMOD_RW, "Practical upper limit of total metaslabs per top-level vdev"); ZFS_MODULE_PARAM(zfs, zfs_, slow_io_events_per_second, UINT, ZMOD_RW, "Rate limit slow IO (delay) events to this many per second"); /* BEGIN CSTYLED */ ZFS_MODULE_PARAM(zfs, zfs_, checksum_events_per_second, UINT, ZMOD_RW, "Rate limit checksum events to this many checksum errors per second " "(do not set below ZED threshold)."); /* END CSTYLED */ ZFS_MODULE_PARAM(zfs, zfs_, scan_ignore_errors, INT, ZMOD_RW, "Ignore errors during resilver/scrub"); ZFS_MODULE_PARAM(zfs_vdev, vdev_, validate_skip, INT, ZMOD_RW, "Bypass vdev_validate()"); ZFS_MODULE_PARAM(zfs, zfs_, nocacheflush, INT, ZMOD_RW, "Disable cache flushes"); ZFS_MODULE_PARAM(zfs, zfs_, embedded_slog_min_ms, UINT, ZMOD_RW, "Minimum number of metaslabs required to dedicate one for log blocks"); /* BEGIN CSTYLED */ ZFS_MODULE_PARAM_CALL(zfs_vdev, zfs_vdev_, min_auto_ashift, param_set_min_auto_ashift, param_get_uint, ZMOD_RW, "Minimum ashift used when creating new top-level vdevs"); ZFS_MODULE_PARAM_CALL(zfs_vdev, zfs_vdev_, max_auto_ashift, param_set_max_auto_ashift, param_get_uint, ZMOD_RW, "Maximum ashift used when optimizing for logical -> physical sector " "size on new top-level vdevs"); /* END CSTYLED */ diff --git a/module/zfs/zcp_get.c b/module/zfs/zcp_get.c index cd17374eb422..f28266b8095f 100644 --- a/module/zfs/zcp_get.c +++ b/module/zfs/zcp_get.c @@ -1,809 +1,811 @@ /* * CDDL HEADER START * * This file and its contents are supplied under the terms of the * Common Development and Distribution License ("CDDL"), version 1.0. * You may only use this file in accordance with the terms of version * 1.0 of the CDDL. * * A full copy of the text of the CDDL should have accompanied this * source. A copy of the CDDL is also available via the Internet at * http://www.illumos.org/license/CDDL. * * CDDL HEADER END */ /* * Copyright (c) 2016 by Delphix. All rights reserved. */ #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #ifdef _KERNEL #include #include #endif static int get_objset_type(dsl_dataset_t *ds, zfs_type_t *type) { int error; objset_t *os; error = dmu_objset_from_ds(ds, &os); if (error != 0) return (error); if (ds->ds_is_snapshot) { *type = ZFS_TYPE_SNAPSHOT; } else { switch (os->os_phys->os_type) { case DMU_OST_ZFS: *type = ZFS_TYPE_FILESYSTEM; break; case DMU_OST_ZVOL: *type = ZFS_TYPE_VOLUME; break; default: return (EINVAL); } } return (0); } /* * Returns the string name of ds's type in str (a buffer which should be * at least 12 bytes long). */ static int get_objset_type_name(dsl_dataset_t *ds, char *str) { zfs_type_t type = ZFS_TYPE_INVALID; int error = get_objset_type(ds, &type); if (error != 0) return (error); switch (type) { case ZFS_TYPE_SNAPSHOT: (void) strlcpy(str, "snapshot", ZAP_MAXVALUELEN); break; case ZFS_TYPE_FILESYSTEM: (void) strlcpy(str, "filesystem", ZAP_MAXVALUELEN); break; case ZFS_TYPE_VOLUME: (void) strlcpy(str, "volume", ZAP_MAXVALUELEN); break; default: return (EINVAL); } return (0); } /* * Determines the source of a property given its setpoint and * property type. It pushes the source to the lua stack. */ static void get_prop_src(lua_State *state, const char *setpoint, zfs_prop_t prop) { if (zfs_prop_readonly(prop) || (prop == ZFS_PROP_VERSION)) { lua_pushnil(state); } else { const char *src; if (strcmp("", setpoint) == 0) { src = "default"; } else { src = setpoint; } (void) lua_pushstring(state, src); } } /* * Given an error encountered while getting properties, either longjmp's for * a fatal error or pushes nothing to the stack for a non fatal one. */ static int zcp_handle_error(lua_State *state, const char *dataset_name, const char *property_name, int error) { ASSERT3S(error, !=, 0); if (error == ENOENT) { return (0); } else if (error == EINVAL) { return (luaL_error(state, "property '%s' is not a valid property on dataset '%s'", property_name, dataset_name)); } else if (error == EIO) { return (luaL_error(state, "I/O error while retrieving property '%s' on dataset '%s'", property_name, dataset_name)); } else { return (luaL_error(state, "unexpected error %d while " "retrieving property '%s' on dataset '%s'", error, property_name, dataset_name)); } } /* * Look up a user defined property in the zap object. If it exists, push it * and the setpoint onto the stack, otherwise don't push anything. */ static int zcp_get_user_prop(lua_State *state, dsl_pool_t *dp, const char *dataset_name, const char *property_name) { int error; char *buf; char setpoint[ZFS_MAX_DATASET_NAME_LEN]; /* * zcp_dataset_hold will either successfully return the requested * dataset or throw a lua error and longjmp out of the zfs.get_prop call * without returning. */ dsl_dataset_t *ds = zcp_dataset_hold(state, dp, dataset_name, FTAG); if (ds == NULL) return (1); /* not reached; zcp_dataset_hold() longjmp'd */ buf = kmem_alloc(ZAP_MAXVALUELEN, KM_SLEEP); error = dsl_prop_get_ds(ds, property_name, 1, ZAP_MAXVALUELEN, buf, setpoint); dsl_dataset_rele(ds, FTAG); if (error != 0) { kmem_free(buf, ZAP_MAXVALUELEN); return (zcp_handle_error(state, dataset_name, property_name, error)); } (void) lua_pushstring(state, buf); (void) lua_pushstring(state, setpoint); kmem_free(buf, ZAP_MAXVALUELEN); return (2); } /* * Check if the property we're looking for is stored in the ds_dir. If so, * return it in the 'val' argument. Return 0 on success and ENOENT and if * the property is not present. */ static int get_dsl_dir_prop(dsl_dataset_t *ds, zfs_prop_t zfs_prop, uint64_t *val) { dsl_dir_t *dd = ds->ds_dir; mutex_enter(&dd->dd_lock); switch (zfs_prop) { case ZFS_PROP_USEDSNAP: *val = dsl_dir_get_usedsnap(dd); break; case ZFS_PROP_USEDCHILD: *val = dsl_dir_get_usedchild(dd); break; case ZFS_PROP_USEDDS: *val = dsl_dir_get_usedds(dd); break; case ZFS_PROP_USEDREFRESERV: *val = dsl_dir_get_usedrefreserv(dd); break; case ZFS_PROP_LOGICALUSED: *val = dsl_dir_get_logicalused(dd); break; default: mutex_exit(&dd->dd_lock); return (SET_ERROR(ENOENT)); } mutex_exit(&dd->dd_lock); return (0); } /* * Check if the property we're looking for is stored at the dsl_dataset or * dsl_dir level. If so, push the property value and source onto the lua stack * and return 0. If it is not present or a failure occurs in lookup, return a * non-zero error value. */ static int get_special_prop(lua_State *state, dsl_dataset_t *ds, const char *dsname, zfs_prop_t zfs_prop) { int error = 0; objset_t *os; uint64_t numval = 0; char *strval = kmem_alloc(ZAP_MAXVALUELEN, KM_SLEEP); char setpoint[ZFS_MAX_DATASET_NAME_LEN] = "Internal error - setpoint not determined"; zfs_type_t ds_type = ZFS_TYPE_INVALID; zprop_type_t prop_type = zfs_prop_get_type(zfs_prop); (void) get_objset_type(ds, &ds_type); switch (zfs_prop) { case ZFS_PROP_REFRATIO: numval = dsl_get_refratio(ds); break; case ZFS_PROP_USED: numval = dsl_get_used(ds); break; case ZFS_PROP_CLONES: { nvlist_t *clones = fnvlist_alloc(); error = get_clones_stat_impl(ds, clones); if (error == 0) { /* push list to lua stack */ VERIFY0(zcp_nvlist_to_lua(state, clones, NULL, 0ULL)); /* source */ (void) lua_pushnil(state); } nvlist_free(clones); kmem_free(strval, ZAP_MAXVALUELEN); return (error); } case ZFS_PROP_COMPRESSRATIO: numval = dsl_get_compressratio(ds); break; case ZFS_PROP_CREATION: numval = dsl_get_creation(ds); break; case ZFS_PROP_REFERENCED: numval = dsl_get_referenced(ds); break; case ZFS_PROP_AVAILABLE: numval = dsl_get_available(ds); break; case ZFS_PROP_LOGICALREFERENCED: numval = dsl_get_logicalreferenced(ds); break; case ZFS_PROP_CREATETXG: numval = dsl_get_creationtxg(ds); break; case ZFS_PROP_GUID: numval = dsl_get_guid(ds); break; case ZFS_PROP_UNIQUE: numval = dsl_get_unique(ds); break; case ZFS_PROP_OBJSETID: numval = dsl_get_objsetid(ds); break; case ZFS_PROP_ORIGIN: dsl_dir_get_origin(ds->ds_dir, strval); break; case ZFS_PROP_USERACCOUNTING: error = dmu_objset_from_ds(ds, &os); if (error == 0) numval = dmu_objset_userspace_present(os); break; case ZFS_PROP_WRITTEN: error = dsl_get_written(ds, &numval); break; case ZFS_PROP_TYPE: error = get_objset_type_name(ds, strval); break; case ZFS_PROP_PREV_SNAP: error = dsl_get_prev_snap(ds, strval); break; case ZFS_PROP_NAME: dsl_dataset_name(ds, strval); break; case ZFS_PROP_MOUNTPOINT: error = dsl_get_mountpoint(ds, dsname, strval, setpoint); break; case ZFS_PROP_VERSION: /* should be a snapshot or filesystem */ ASSERT(ds_type != ZFS_TYPE_VOLUME); error = dmu_objset_from_ds(ds, &os); /* look in the master node for the version */ if (error == 0) { error = zap_lookup(os, MASTER_NODE_OBJ, ZPL_VERSION_STR, sizeof (numval), 1, &numval); } break; case ZFS_PROP_DEFER_DESTROY: numval = dsl_get_defer_destroy(ds); break; case ZFS_PROP_USERREFS: numval = dsl_get_userrefs(ds); break; case ZFS_PROP_FILESYSTEM_COUNT: error = dsl_dir_get_filesystem_count(ds->ds_dir, &numval); (void) strlcpy(setpoint, "", ZFS_MAX_DATASET_NAME_LEN); break; case ZFS_PROP_SNAPSHOT_COUNT: error = dsl_dir_get_snapshot_count(ds->ds_dir, &numval); (void) strlcpy(setpoint, "", ZFS_MAX_DATASET_NAME_LEN); break; case ZFS_PROP_NUMCLONES: numval = dsl_get_numclones(ds); break; case ZFS_PROP_INCONSISTENT: numval = dsl_get_inconsistent(ds); break; case ZFS_PROP_IVSET_GUID: if (dsl_dataset_is_zapified(ds)) { error = zap_lookup(ds->ds_dir->dd_pool->dp_meta_objset, ds->ds_object, DS_FIELD_IVSET_GUID, sizeof (numval), 1, &numval); } else { error = ENOENT; } break; case ZFS_PROP_RECEIVE_RESUME_TOKEN: { char *token = get_receive_resume_token(ds); if (token != NULL) { (void) strlcpy(strval, token, ZAP_MAXVALUELEN); kmem_strfree(token); } else { error = ENOENT; } break; } case ZFS_PROP_VOLSIZE: ASSERT(ds_type == ZFS_TYPE_VOLUME || ds_type == ZFS_TYPE_SNAPSHOT); error = dmu_objset_from_ds(ds, &os); if (error == 0) { error = zap_lookup(os, ZVOL_ZAP_OBJ, "size", sizeof (numval), 1, &numval); } if (error == 0) (void) strlcpy(setpoint, dsname, ZFS_MAX_DATASET_NAME_LEN); break; case ZFS_PROP_VOLBLOCKSIZE: { ASSERT(ds_type == ZFS_TYPE_VOLUME); dmu_object_info_t doi; error = dmu_objset_from_ds(ds, &os); if (error == 0) { error = dmu_object_info(os, ZVOL_OBJ, &doi); if (error == 0) numval = doi.doi_data_block_size; } break; } case ZFS_PROP_KEYSTATUS: case ZFS_PROP_KEYFORMAT: { /* provide defaults in case no crypto obj exists */ setpoint[0] = '\0'; if (zfs_prop == ZFS_PROP_KEYSTATUS) numval = ZFS_KEYSTATUS_NONE; else numval = ZFS_KEYFORMAT_NONE; nvlist_t *nvl, *propval; nvl = fnvlist_alloc(); dsl_dataset_crypt_stats(ds, nvl); if (nvlist_lookup_nvlist(nvl, zfs_prop_to_name(zfs_prop), &propval) == 0) { char *source; (void) nvlist_lookup_uint64(propval, ZPROP_VALUE, &numval); if (nvlist_lookup_string(propval, ZPROP_SOURCE, &source) == 0) strlcpy(setpoint, source, sizeof (setpoint)); } nvlist_free(nvl); break; } case ZFS_PROP_SNAPSHOTS_CHANGED: numval = dsl_dir_snap_cmtime(ds->ds_dir).tv_sec; break; default: /* Did not match these props, check in the dsl_dir */ error = get_dsl_dir_prop(ds, zfs_prop, &numval); } if (error != 0) { kmem_free(strval, ZAP_MAXVALUELEN); return (error); } switch (prop_type) { case PROP_TYPE_NUMBER: { (void) lua_pushnumber(state, numval); break; } case PROP_TYPE_STRING: { (void) lua_pushstring(state, strval); break; } case PROP_TYPE_INDEX: { const char *propval; error = zfs_prop_index_to_string(zfs_prop, numval, &propval); if (error != 0) { kmem_free(strval, ZAP_MAXVALUELEN); return (error); } (void) lua_pushstring(state, propval); break; } } kmem_free(strval, ZAP_MAXVALUELEN); /* Push the source to the stack */ get_prop_src(state, setpoint, zfs_prop); return (0); } /* * Look up a property and its source in the zap object. If the value is * present and successfully retrieved, push the value and source on the * lua stack and return 0. On failure, return a non-zero error value. */ static int get_zap_prop(lua_State *state, dsl_dataset_t *ds, zfs_prop_t zfs_prop) { int error = 0; char setpoint[ZFS_MAX_DATASET_NAME_LEN]; char *strval = kmem_alloc(ZAP_MAXVALUELEN, KM_SLEEP); uint64_t numval; const char *prop_name = zfs_prop_to_name(zfs_prop); zprop_type_t prop_type = zfs_prop_get_type(zfs_prop); if (prop_type == PROP_TYPE_STRING) { /* Push value to lua stack */ error = dsl_prop_get_ds(ds, prop_name, 1, ZAP_MAXVALUELEN, strval, setpoint); if (error == 0) (void) lua_pushstring(state, strval); } else { error = dsl_prop_get_ds(ds, prop_name, sizeof (numval), 1, &numval, setpoint); - + if (error != 0) + goto out; #ifdef _KERNEL /* Fill in temporary value for prop, if applicable */ (void) zfs_get_temporary_prop(ds, zfs_prop, &numval, setpoint); #else kmem_free(strval, ZAP_MAXVALUELEN); return (luaL_error(state, "temporary properties only supported in kernel mode", prop_name)); #endif /* Push value to lua stack */ if (prop_type == PROP_TYPE_INDEX) { const char *propval; error = zfs_prop_index_to_string(zfs_prop, numval, &propval); if (error == 0) (void) lua_pushstring(state, propval); } else { if (error == 0) (void) lua_pushnumber(state, numval); } } +out: kmem_free(strval, ZAP_MAXVALUELEN); if (error == 0) get_prop_src(state, setpoint, zfs_prop); return (error); } /* * Determine whether property is valid for a given dataset */ boolean_t prop_valid_for_ds(dsl_dataset_t *ds, zfs_prop_t zfs_prop) { zfs_type_t zfs_type = ZFS_TYPE_INVALID; /* properties not supported */ if ((zfs_prop == ZFS_PROP_ISCSIOPTIONS) || (zfs_prop == ZFS_PROP_MOUNTED)) return (B_FALSE); /* if we want the origin prop, ds must be a clone */ if ((zfs_prop == ZFS_PROP_ORIGIN) && (!dsl_dir_is_clone(ds->ds_dir))) return (B_FALSE); int error = get_objset_type(ds, &zfs_type); if (error != 0) return (B_FALSE); return (zfs_prop_valid_for_type(zfs_prop, zfs_type, B_FALSE)); } /* * Look up a given dataset property. On success return 2, the number of * values pushed to the lua stack (property value and source). On a fatal * error, longjmp. On a non fatal error push nothing. */ static int zcp_get_system_prop(lua_State *state, dsl_pool_t *dp, const char *dataset_name, zfs_prop_t zfs_prop) { int error; /* * zcp_dataset_hold will either successfully return the requested * dataset or throw a lua error and longjmp out of the zfs.get_prop call * without returning. */ dsl_dataset_t *ds = zcp_dataset_hold(state, dp, dataset_name, FTAG); if (ds == NULL) return (1); /* not reached; zcp_dataset_hold() longjmp'd */ /* Check that the property is valid for the given dataset */ const char *prop_name = zfs_prop_to_name(zfs_prop); if (!prop_valid_for_ds(ds, zfs_prop)) { dsl_dataset_rele(ds, FTAG); return (0); } /* Check if the property can be accessed directly */ error = get_special_prop(state, ds, dataset_name, zfs_prop); if (error == 0) { dsl_dataset_rele(ds, FTAG); /* The value and source have been pushed by get_special_prop */ return (2); } if (error != ENOENT) { dsl_dataset_rele(ds, FTAG); return (zcp_handle_error(state, dataset_name, prop_name, error)); } /* If we were unable to find it, look in the zap object */ error = get_zap_prop(state, ds, zfs_prop); dsl_dataset_rele(ds, FTAG); if (error != 0) { return (zcp_handle_error(state, dataset_name, prop_name, error)); } /* The value and source have been pushed by get_zap_prop */ return (2); } #ifdef _KERNEL static zfs_userquota_prop_t get_userquota_prop(const char *prop_name) { zfs_userquota_prop_t type; /* Figure out the property type ({user|group}{quota|used}) */ for (type = 0; type < ZFS_NUM_USERQUOTA_PROPS; type++) { if (strncmp(prop_name, zfs_userquota_prop_prefixes[type], strlen(zfs_userquota_prop_prefixes[type])) == 0) break; } return (type); } /* * Given the name of a zfs_userquota_prop, this function determines the * prop type as well as the numeric group/user ids based on the string * following the '@' in the property name. On success, returns 0. On failure, * returns a non-zero error. * 'domain' must be free'd by caller using kmem_strfree() */ static int parse_userquota_prop(const char *prop_name, zfs_userquota_prop_t *type, char **domain, uint64_t *rid) { char *cp, *end, *domain_val; *type = get_userquota_prop(prop_name); if (*type >= ZFS_NUM_USERQUOTA_PROPS) return (EINVAL); *rid = 0; cp = strchr(prop_name, '@') + 1; if (strncmp(cp, "S-1-", 4) == 0) { /* * It's a numeric SID (eg "S-1-234-567-89") and we want to * separate the domain id and the rid */ int domain_len = strrchr(cp, '-') - cp; domain_val = kmem_alloc(domain_len + 1, KM_SLEEP); (void) strlcpy(domain_val, cp, domain_len + 1); cp += domain_len + 1; (void) ddi_strtoll(cp, &end, 10, (longlong_t *)rid); if (*end != '\0') { kmem_strfree(domain_val); return (EINVAL); } } else { /* It's only a user/group ID (eg "12345"), just get the rid */ domain_val = NULL; (void) ddi_strtoll(cp, &end, 10, (longlong_t *)rid); if (*end != '\0') return (EINVAL); } *domain = domain_val; return (0); } /* * Look up {user|group}{quota|used} property for given dataset. On success * push the value (quota or used amount) and the setpoint. On failure, push * a lua error. */ static int zcp_get_userquota_prop(lua_State *state, dsl_pool_t *dp, const char *dataset_name, const char *prop_name) { zfsvfs_t *zfvp; zfsvfs_t *zfsvfs; int error; zfs_userquota_prop_t type; char *domain; uint64_t rid, value = 0; objset_t *os; dsl_dataset_t *ds = zcp_dataset_hold(state, dp, dataset_name, FTAG); if (ds == NULL) return (1); /* not reached; zcp_dataset_hold() longjmp'd */ error = parse_userquota_prop(prop_name, &type, &domain, &rid); if (error == 0) { error = dmu_objset_from_ds(ds, &os); if (error == 0) { zfsvfs = kmem_zalloc(sizeof (zfsvfs_t), KM_SLEEP); error = zfsvfs_create_impl(&zfvp, zfsvfs, os); if (error == 0) { error = zfs_userspace_one(zfvp, type, domain, rid, &value); zfsvfs_free(zfvp); } } if (domain != NULL) kmem_strfree(domain); } dsl_dataset_rele(ds, FTAG); if ((value == 0) && ((type == ZFS_PROP_USERQUOTA) || (type == ZFS_PROP_GROUPQUOTA))) error = SET_ERROR(ENOENT); if (error != 0) { return (zcp_handle_error(state, dataset_name, prop_name, error)); } (void) lua_pushnumber(state, value); (void) lua_pushstring(state, dataset_name); return (2); } #endif /* * Determines the name of the snapshot referenced in the written property * name. Returns snapshot name in snap_name, a buffer that must be at least * as large as ZFS_MAX_DATASET_NAME_LEN */ static void parse_written_prop(const char *dataset_name, const char *prop_name, char *snap_name) { ASSERT(zfs_prop_written(prop_name)); const char *name = prop_name + ZFS_WRITTEN_PROP_PREFIX_LEN; if (strchr(name, '@') == NULL) { (void) snprintf(snap_name, ZFS_MAX_DATASET_NAME_LEN, "%s@%s", dataset_name, name); } else { (void) strlcpy(snap_name, name, ZFS_MAX_DATASET_NAME_LEN); } } /* * Look up written@ property for given dataset. On success * push the value and the setpoint. If error is fatal, we will * longjmp, otherwise push nothing. */ static int zcp_get_written_prop(lua_State *state, dsl_pool_t *dp, const char *dataset_name, const char *prop_name) { char snap_name[ZFS_MAX_DATASET_NAME_LEN]; uint64_t used, comp, uncomp; dsl_dataset_t *old; int error = 0; parse_written_prop(dataset_name, prop_name, snap_name); dsl_dataset_t *new = zcp_dataset_hold(state, dp, dataset_name, FTAG); if (new == NULL) return (1); /* not reached; zcp_dataset_hold() longjmp'd */ error = dsl_dataset_hold(dp, snap_name, FTAG, &old); if (error != 0) { dsl_dataset_rele(new, FTAG); return (zcp_dataset_hold_error(state, dp, snap_name, error)); } error = dsl_dataset_space_written(old, new, &used, &comp, &uncomp); dsl_dataset_rele(old, FTAG); dsl_dataset_rele(new, FTAG); if (error != 0) { return (zcp_handle_error(state, dataset_name, snap_name, error)); } (void) lua_pushnumber(state, used); (void) lua_pushstring(state, dataset_name); return (2); } static int zcp_get_prop(lua_State *state); static const zcp_lib_info_t zcp_get_prop_info = { .name = "get_prop", .func = zcp_get_prop, .pargs = { { .za_name = "dataset", .za_lua_type = LUA_TSTRING }, { .za_name = "property", .za_lua_type = LUA_TSTRING }, {NULL, 0} }, .kwargs = { {NULL, 0} } }; static int zcp_get_prop(lua_State *state) { const char *dataset_name; const char *property_name; dsl_pool_t *dp = zcp_run_info(state)->zri_pool; const zcp_lib_info_t *libinfo = &zcp_get_prop_info; zcp_parse_args(state, libinfo->name, libinfo->pargs, libinfo->kwargs); dataset_name = lua_tostring(state, 1); property_name = lua_tostring(state, 2); /* User defined property */ if (zfs_prop_user(property_name)) { return (zcp_get_user_prop(state, dp, dataset_name, property_name)); } /* userspace property */ if (zfs_prop_userquota(property_name)) { #ifdef _KERNEL return (zcp_get_userquota_prop(state, dp, dataset_name, property_name)); #else return (luaL_error(state, "user quota properties only supported in kernel mode", property_name)); #endif } /* written@ property */ if (zfs_prop_written(property_name)) { return (zcp_get_written_prop(state, dp, dataset_name, property_name)); } zfs_prop_t zfs_prop = zfs_name_to_prop(property_name); /* Valid system property */ if (zfs_prop != ZPROP_INVAL) { return (zcp_get_system_prop(state, dp, dataset_name, zfs_prop)); } /* Invalid property name */ return (luaL_error(state, "'%s' is not a valid property", property_name)); } int zcp_load_get_lib(lua_State *state) { lua_pushcclosure(state, zcp_get_prop_info.func, 0); lua_setfield(state, -2, zcp_get_prop_info.name); return (1); } diff --git a/tests/zfs-tests/cmd/btree_test.c b/tests/zfs-tests/cmd/btree_test.c index ab8967b22b22..fb9de9c77787 100644 --- a/tests/zfs-tests/cmd/btree_test.c +++ b/tests/zfs-tests/cmd/btree_test.c @@ -1,568 +1,557 @@ /* * This file and its contents are supplied under the terms of the * Common Development and Distribution License ("CDDL"), version 1.0. * You may only use this file in accordance with the terms of version * 1.0 of the CDDL. * * A full copy of the text of the CDDL should have accompanied this * source. A copy of the CDDL is also available via the Internet at * http://www.illumos.org/license/CDDL. */ /* * Copyright (c) 2019 by Delphix. All rights reserved. */ #include #include #include #include #include #include #include #define BUFSIZE 256 int seed = 0; int stress_timeout = 180; int contents_frequency = 100; int tree_limit = 64 * 1024; boolean_t stress_only = B_FALSE; static void usage(int exit_value) { (void) fprintf(stderr, "Usage:\tbtree_test -n \n"); (void) fprintf(stderr, "\tbtree_test -s [-r ] [-l ] " "[-t timeout>] [-c check_contents]\n"); (void) fprintf(stderr, "\tbtree_test [-r ] [-l ] " "[-t timeout>] [-c check_contents]\n"); (void) fprintf(stderr, "\n With the -n option, run the named " "negative test. With the -s option,\n"); (void) fprintf(stderr, " run the stress test according to the " "other options passed. With\n"); (void) fprintf(stderr, " neither, run all the positive tests, " "including the stress test with\n"); (void) fprintf(stderr, " the default options.\n"); (void) fprintf(stderr, "\n Options that control the stress test\n"); (void) fprintf(stderr, "\t-c stress iterations after which to compare " "tree contents [default: 100]\n"); (void) fprintf(stderr, "\t-l the largest value to allow in the tree " "[default: 1M]\n"); (void) fprintf(stderr, "\t-r random seed [default: from " "gettimeofday()]\n"); (void) fprintf(stderr, "\t-t seconds to let the stress test run " "[default: 180]\n"); exit(exit_value); } typedef struct int_node { avl_node_t node; uint64_t data; } int_node_t; /* * Utility functions */ static int avl_compare(const void *v1, const void *v2) { const int_node_t *n1 = v1; const int_node_t *n2 = v2; uint64_t a = n1->data; uint64_t b = n2->data; return (TREE_CMP(a, b)); } static int zfs_btree_compare(const void *v1, const void *v2) { const uint64_t *a = v1; const uint64_t *b = v2; return (TREE_CMP(*a, *b)); } static void verify_contents(avl_tree_t *avl, zfs_btree_t *bt) { static int count = 0; zfs_btree_index_t bt_idx = {0}; int_node_t *node; uint64_t *data; boolean_t forward = count % 2 == 0 ? B_TRUE : B_FALSE; count++; ASSERT3U(avl_numnodes(avl), ==, zfs_btree_numnodes(bt)); if (forward == B_TRUE) { node = avl_first(avl); data = zfs_btree_first(bt, &bt_idx); } else { node = avl_last(avl); data = zfs_btree_last(bt, &bt_idx); } while (node != NULL) { ASSERT3U(*data, ==, node->data); if (forward == B_TRUE) { data = zfs_btree_next(bt, &bt_idx, &bt_idx); node = AVL_NEXT(avl, node); } else { data = zfs_btree_prev(bt, &bt_idx, &bt_idx); node = AVL_PREV(avl, node); } } } static void verify_node(avl_tree_t *avl, zfs_btree_t *bt, int_node_t *node) { zfs_btree_index_t bt_idx = {0}; zfs_btree_index_t bt_idx2 = {0}; int_node_t *inp; uint64_t data = node->data; uint64_t *rv = NULL; ASSERT3U(avl_numnodes(avl), ==, zfs_btree_numnodes(bt)); ASSERT3P((rv = (uint64_t *)zfs_btree_find(bt, &data, &bt_idx)), !=, NULL); ASSERT3S(*rv, ==, data); ASSERT3P(zfs_btree_get(bt, &bt_idx), !=, NULL); ASSERT3S(data, ==, *(uint64_t *)zfs_btree_get(bt, &bt_idx)); if ((inp = AVL_NEXT(avl, node)) != NULL) { ASSERT3P((rv = zfs_btree_next(bt, &bt_idx, &bt_idx2)), !=, NULL); ASSERT3P(rv, ==, zfs_btree_get(bt, &bt_idx2)); ASSERT3S(inp->data, ==, *rv); } else { ASSERT3U(data, ==, *(uint64_t *)zfs_btree_last(bt, &bt_idx)); } if ((inp = AVL_PREV(avl, node)) != NULL) { ASSERT3P((rv = zfs_btree_prev(bt, &bt_idx, &bt_idx2)), !=, NULL); ASSERT3P(rv, ==, zfs_btree_get(bt, &bt_idx2)); ASSERT3S(inp->data, ==, *rv); } else { ASSERT3U(data, ==, *(uint64_t *)zfs_btree_first(bt, &bt_idx)); } } /* * Tests */ /* Verify that zfs_btree_find works correctly with a NULL index. */ static int find_without_index(zfs_btree_t *bt, char *why) { u_longlong_t *p, i = 12345; zfs_btree_add(bt, &i); if ((p = (u_longlong_t *)zfs_btree_find(bt, &i, NULL)) == NULL || *p != i) { (void) snprintf(why, BUFSIZE, "Unexpectedly found %llu\n", p == NULL ? 0 : *p); return (1); } i++; if ((p = (u_longlong_t *)zfs_btree_find(bt, &i, NULL)) != NULL) { (void) snprintf(why, BUFSIZE, "Found bad value: %llu\n", *p); return (1); } return (0); } /* Verify simple insertion and removal from the tree. */ static int insert_find_remove(zfs_btree_t *bt, char *why) { u_longlong_t *p, i = 12345; zfs_btree_index_t bt_idx = {0}; /* Insert 'i' into the tree, and attempt to find it again. */ zfs_btree_add(bt, &i); if ((p = (u_longlong_t *)zfs_btree_find(bt, &i, &bt_idx)) == NULL) { (void) snprintf(why, BUFSIZE, "Didn't find value in tree\n"); return (1); } else if (*p != i) { (void) snprintf(why, BUFSIZE, "Found (%llu) in tree\n", *p); return (1); } ASSERT3S(zfs_btree_numnodes(bt), ==, 1); zfs_btree_verify(bt); /* Remove 'i' from the tree, and verify it is not found. */ zfs_btree_remove(bt, &i); if ((p = (u_longlong_t *)zfs_btree_find(bt, &i, &bt_idx)) != NULL) { (void) snprintf(why, BUFSIZE, "Found removed value (%llu)\n", *p); return (1); } ASSERT3S(zfs_btree_numnodes(bt), ==, 0); zfs_btree_verify(bt); return (0); } /* * Add a number of random entries into a btree and avl tree. Then walk them * backwards and forwards while emptying the tree, verifying the trees look * the same. */ static int drain_tree(zfs_btree_t *bt, char *why) { - uint64_t *p; avl_tree_t avl; int i = 0; int_node_t *node; avl_index_t avl_idx = {0}; zfs_btree_index_t bt_idx = {0}; avl_create(&avl, avl_compare, sizeof (int_node_t), offsetof(int_node_t, node)); /* Fill both trees with the same data */ for (i = 0; i < 64 * 1024; i++) { - void *ret; - u_longlong_t randval = random(); - if ((p = (uint64_t *)zfs_btree_find(bt, &randval, &bt_idx)) != - NULL) { + if (zfs_btree_find(bt, &randval, &bt_idx) != NULL) { continue; } zfs_btree_add_idx(bt, &randval, &bt_idx); node = malloc(sizeof (int_node_t)); if (node == NULL) { perror("malloc"); exit(EXIT_FAILURE); } node->data = randval; - if ((ret = avl_find(&avl, node, &avl_idx)) != NULL) { + if (avl_find(&avl, node, &avl_idx) != NULL) { (void) snprintf(why, BUFSIZE, "Found in avl: %llu\n", randval); return (1); } avl_insert(&avl, node, avl_idx); } /* Remove data from either side of the trees, comparing the data */ while (avl_numnodes(&avl) != 0) { uint64_t *data; ASSERT3U(avl_numnodes(&avl), ==, zfs_btree_numnodes(bt)); if (avl_numnodes(&avl) % 2 == 0) { node = avl_first(&avl); data = zfs_btree_first(bt, &bt_idx); } else { node = avl_last(&avl); data = zfs_btree_last(bt, &bt_idx); } ASSERT3U(node->data, ==, *data); zfs_btree_remove_idx(bt, &bt_idx); avl_remove(&avl, node); if (avl_numnodes(&avl) == 0) { break; } node = avl_first(&avl); ASSERT3U(node->data, ==, *(uint64_t *)zfs_btree_first(bt, NULL)); node = avl_last(&avl); ASSERT3U(node->data, ==, *(uint64_t *)zfs_btree_last(bt, NULL)); } ASSERT3S(zfs_btree_numnodes(bt), ==, 0); void *avl_cookie = NULL; while ((node = avl_destroy_nodes(&avl, &avl_cookie)) != NULL) free(node); avl_destroy(&avl); return (0); } /* * This test uses an avl and btree, and continually processes new random * values. Each value is either removed or inserted, depending on whether * or not it is found in the tree. The test periodically checks that both * trees have the same data and does consistency checks. This stress * option can also be run on its own from the command line. */ static int stress_tree(zfs_btree_t *bt, char *why) { (void) why; avl_tree_t avl; int_node_t *node; struct timeval tp; time_t t0; int insertions = 0, removals = 0, iterations = 0; u_longlong_t max = 0, min = UINT64_MAX; (void) gettimeofday(&tp, NULL); t0 = tp.tv_sec; avl_create(&avl, avl_compare, sizeof (int_node_t), offsetof(int_node_t, node)); while (1) { zfs_btree_index_t bt_idx = {0}; avl_index_t avl_idx = {0}; uint64_t randval = random() % tree_limit; node = malloc(sizeof (*node)); if (node == NULL) { perror("malloc"); exit(EXIT_FAILURE); } node->data = randval; max = randval > max ? randval : max; min = randval < min ? randval : min; void *ret = avl_find(&avl, node, &avl_idx); if (ret == NULL) { insertions++; avl_insert(&avl, node, avl_idx); ASSERT3P(zfs_btree_find(bt, &randval, &bt_idx), ==, NULL); zfs_btree_add_idx(bt, &randval, &bt_idx); verify_node(&avl, bt, node); } else { removals++; verify_node(&avl, bt, ret); zfs_btree_remove(bt, &randval); avl_remove(&avl, ret); free(ret); free(node); } zfs_btree_verify(bt); iterations++; if (iterations % contents_frequency == 0) { verify_contents(&avl, bt); } zfs_btree_verify(bt); (void) gettimeofday(&tp, NULL); if (tp.tv_sec > t0 + stress_timeout) { fprintf(stderr, "insertions/removals: %u/%u\nmax/min: " "%llu/%llu\n", insertions, removals, max, min); break; } } void *avl_cookie = NULL; while ((node = avl_destroy_nodes(&avl, &avl_cookie)) != NULL) free(node); avl_destroy(&avl); if (stress_only) { zfs_btree_index_t *idx = NULL; - uint64_t *rv; - - while ((rv = zfs_btree_destroy_nodes(bt, &idx)) != NULL) + while (zfs_btree_destroy_nodes(bt, &idx) != NULL) ; zfs_btree_verify(bt); } return (0); } /* * Verify inserting a duplicate value will cause a crash. * Note: negative test; return of 0 is a failure. */ static int insert_duplicate(zfs_btree_t *bt) { - uint64_t *p, i = 23456; + uint64_t i = 23456; zfs_btree_index_t bt_idx = {0}; - if ((p = (uint64_t *)zfs_btree_find(bt, &i, &bt_idx)) != NULL) { + if (zfs_btree_find(bt, &i, &bt_idx) != NULL) { fprintf(stderr, "Found value in empty tree.\n"); return (0); } zfs_btree_add_idx(bt, &i, &bt_idx); - if ((p = (uint64_t *)zfs_btree_find(bt, &i, &bt_idx)) == NULL) { + if (zfs_btree_find(bt, &i, &bt_idx) == NULL) { fprintf(stderr, "Did not find expected value.\n"); return (0); } /* Crash on inserting a duplicate */ zfs_btree_add_idx(bt, &i, NULL); return (0); } /* * Verify removing a non-existent value will cause a crash. * Note: negative test; return of 0 is a failure. */ static int remove_missing(zfs_btree_t *bt) { - uint64_t *p, i = 23456; + uint64_t i = 23456; zfs_btree_index_t bt_idx = {0}; - if ((p = (uint64_t *)zfs_btree_find(bt, &i, &bt_idx)) != NULL) { + if (zfs_btree_find(bt, &i, &bt_idx) != NULL) { fprintf(stderr, "Found value in empty tree.\n"); return (0); } /* Crash removing a nonexistent entry */ zfs_btree_remove(bt, &i); return (0); } static int do_negative_test(zfs_btree_t *bt, char *test_name) { int rval = 0; struct rlimit rlim = {0}; (void) setrlimit(RLIMIT_CORE, &rlim); if (strcmp(test_name, "insert_duplicate") == 0) { rval = insert_duplicate(bt); } else if (strcmp(test_name, "remove_missing") == 0) { rval = remove_missing(bt); } /* * Return 0, since callers will expect non-zero return values for * these tests, and we should have crashed before getting here anyway. */ (void) fprintf(stderr, "Test: %s returned %d.\n", test_name, rval); return (0); } typedef struct btree_test { const char *name; int (*func)(zfs_btree_t *, char *); } btree_test_t; static btree_test_t test_table[] = { { "insert_find_remove", insert_find_remove }, { "find_without_index", find_without_index }, { "drain_tree", drain_tree }, { "stress_tree", stress_tree }, { NULL, NULL } }; int main(int argc, char *argv[]) { char *negative_test = NULL; int failed_tests = 0; struct timeval tp; zfs_btree_t bt; int c; while ((c = getopt(argc, argv, "c:l:n:r:st:")) != -1) { switch (c) { case 'c': contents_frequency = atoi(optarg); break; case 'l': tree_limit = atoi(optarg); break; case 'n': negative_test = optarg; break; case 'r': seed = atoi(optarg); break; case 's': stress_only = B_TRUE; break; case 't': stress_timeout = atoi(optarg); break; case 'h': default: usage(1); break; } } - argc -= optind; - argv += optind; - optind = 1; - if (seed == 0) { (void) gettimeofday(&tp, NULL); seed = tp.tv_sec; } srandom(seed); zfs_btree_init(); zfs_btree_create(&bt, zfs_btree_compare, sizeof (uint64_t)); /* * This runs the named negative test. None of them should * return, as they both cause crashes. */ if (negative_test) { return (do_negative_test(&bt, negative_test)); } fprintf(stderr, "Seed: %u\n", seed); /* * This is a stress test that does operations on a btree over the * requested timeout period, verifying them against identical * operations in an avl tree. */ if (stress_only != 0) { return (stress_tree(&bt, NULL)); } /* Do the positive tests */ btree_test_t *test = &test_table[0]; while (test->name) { int retval; - uint64_t *rv; char why[BUFSIZE] = {0}; zfs_btree_index_t *idx = NULL; (void) fprintf(stdout, "%-20s", test->name); retval = test->func(&bt, why); if (retval == 0) { (void) fprintf(stdout, "ok\n"); } else { (void) fprintf(stdout, "failed with %d\n", retval); if (strlen(why) != 0) (void) fprintf(stdout, "\t%s\n", why); why[0] = '\0'; failed_tests++; } /* Remove all the elements and re-verify the tree */ - while ((rv = zfs_btree_destroy_nodes(&bt, &idx)) != NULL) + while (zfs_btree_destroy_nodes(&bt, &idx) != NULL) ; zfs_btree_verify(&bt); test++; } zfs_btree_verify(&bt); zfs_btree_fini(); return (failed_tests); } diff --git a/tests/zfs-tests/cmd/ctime.c b/tests/zfs-tests/cmd/ctime.c index f0f3d526eb3e..0f5d81aea613 100644 --- a/tests/zfs-tests/cmd/ctime.c +++ b/tests/zfs-tests/cmd/ctime.c @@ -1,377 +1,376 @@ /* * CDDL HEADER START * * The contents of this file are subject to the terms of the * Common Development and Distribution License (the "License"). * You may not use this file except in compliance with the License. * * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE * or https://opensource.org/licenses/CDDL-1.0. * See the License for the specific language governing permissions * and limitations under the License. * * When distributing Covered Code, include this CDDL HEADER in each * file and include the License file at usr/src/OPENSOLARIS.LICENSE. * If applicable, add the following below this CDDL HEADER, with the * fields enclosed by brackets "[]" replaced with your own identifying * information: Portions Copyright [yyyy] [name of copyright owner] * * CDDL HEADER END */ /* * Copyright 2007 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. */ /* * Copyright (c) 2013 by Delphix. All rights reserved. */ #include #include #ifndef __FreeBSD__ #include #endif #include #include #include #include #include #include #include #include #define ST_ATIME 0 #define ST_CTIME 1 #define ST_MTIME 2 #define ALL_MODE (mode_t)(S_IRWXU|S_IRWXG|S_IRWXO) typedef struct timetest { int type; const char *name; int (*func)(const char *pfile); } timetest_t; static char tfile[BUFSIZ] = { 0 }; /* * DESCRIPTION: * Verify time will be changed correctly after each operation. * * STRATEGY: * 1. Define time test array. * 2. Loop through each item in this array. * 3. Verify the time is changed after each operation. * */ static int get_file_time(const char *pfile, int what, time_t *ptr) { struct stat stat_buf; if (pfile == NULL || ptr == NULL) { return (-1); } if (stat(pfile, &stat_buf) == -1) { return (-1); } switch (what) { case ST_ATIME: *ptr = stat_buf.st_atime; return (0); case ST_CTIME: *ptr = stat_buf.st_ctime; return (0); case ST_MTIME: *ptr = stat_buf.st_mtime; return (0); default: return (-1); } } static ssize_t get_dirnamelen(const char *path) { const char *end = strrchr(path, '/'); return (end ? end - path : -1); } static int do_read(const char *pfile) { int fd, ret = 0; char buf[BUFSIZ] = { 0 }; if (pfile == NULL) { return (-1); } if ((fd = open(pfile, O_RDONLY, ALL_MODE)) == -1) { return (-1); } if (read(fd, buf, sizeof (buf)) == -1) { (void) fprintf(stderr, "read(%d, buf, %zd) failed with errno " "%d\n", fd, sizeof (buf), errno); (void) close(fd); return (1); } (void) close(fd); return (ret); } static int do_write(const char *pfile) { int fd, ret = 0; char buf[BUFSIZ] = "call function do_write()"; if (pfile == NULL) { return (-1); } if ((fd = open(pfile, O_WRONLY, ALL_MODE)) == -1) { return (-1); } if (write(fd, buf, strlen(buf)) == -1) { (void) fprintf(stderr, "write(%d, buf, %d) failed with errno " "%d\n", fd, (int)strlen(buf), errno); (void) close(fd); return (1); } (void) close(fd); return (ret); } static int do_link(const char *pfile) { int ret = 0; char link_file[BUFSIZ + 16] = { 0 }; if (pfile == NULL) { return (-1); } /* * Figure out source file directory name, and create * the link file in the same directory. */ (void) snprintf(link_file, sizeof (link_file), "%.*s/%s", (int)get_dirnamelen(pfile), pfile, "link_file"); if (link(pfile, link_file) == -1) { (void) fprintf(stderr, "link(%s, %s) failed with errno %d\n", pfile, link_file, errno); return (1); } (void) unlink(link_file); return (ret); } static int do_creat(const char *pfile) { int fd, ret = 0; if (pfile == NULL) { return (-1); } if ((fd = creat(pfile, ALL_MODE)) == -1) { (void) fprintf(stderr, "creat(%s, ALL_MODE) failed with errno " "%d\n", pfile, errno); return (1); } (void) close(fd); return (ret); } static int do_utime(const char *pfile) { int ret = 0; if (pfile == NULL) { return (-1); } /* * Times of the file are set to the current time */ if (utime(pfile, NULL) == -1) { (void) fprintf(stderr, "utime(%s, NULL) failed with errno " "%d\n", pfile, errno); return (1); } return (ret); } static int do_chmod(const char *pfile) { int ret = 0; if (pfile == NULL) { return (-1); } if (chmod(pfile, ALL_MODE) == -1) { (void) fprintf(stderr, "chmod(%s, ALL_MODE) failed with " "errno %d\n", pfile, errno); return (1); } return (ret); } static int do_chown(const char *pfile) { int ret = 0; if (pfile == NULL) { return (-1); } if (chown(pfile, getuid(), getgid()) == -1) { (void) fprintf(stderr, "chown(%s, %d, %d) failed with errno " "%d\n", pfile, (int)getuid(), (int)getgid(), errno); return (1); } return (ret); } #ifndef __FreeBSD__ static int do_xattr(const char *pfile) { int ret = 0; const char *value = "user.value"; if (pfile == NULL) { return (-1); } if (setxattr(pfile, "user.x", value, strlen(value), 0) == -1) { (void) fprintf(stderr, "setxattr(%s, %d, %d) failed with errno " "%d\n", pfile, (int)getuid(), (int)getgid(), errno); return (1); } return (ret); } #endif static void cleanup(void) { if ((strlen(tfile) != 0) && (access(tfile, F_OK) == 0)) { (void) unlink(tfile); } } static timetest_t timetest_table[] = { { ST_ATIME, "st_atime", do_read }, { ST_ATIME, "st_atime", do_utime }, { ST_MTIME, "st_mtime", do_creat }, { ST_MTIME, "st_mtime", do_write }, { ST_MTIME, "st_mtime", do_utime }, { ST_CTIME, "st_ctime", do_creat }, { ST_CTIME, "st_ctime", do_write }, { ST_CTIME, "st_ctime", do_chmod }, { ST_CTIME, "st_ctime", do_chown }, { ST_CTIME, "st_ctime", do_link }, { ST_CTIME, "st_ctime", do_utime }, #ifndef __FreeBSD__ { ST_CTIME, "st_ctime", do_xattr }, #endif }; #define NCOMMAND (sizeof (timetest_table) / sizeof (timetest_table[0])) int main(void) { int i, ret, fd; const char *penv[] = {"TESTDIR", "TESTFILE0"}; (void) atexit(cleanup); /* * Get the environment variable values. */ for (i = 0; i < sizeof (penv) / sizeof (char *); i++) { if ((penv[i] = getenv(penv[i])) == NULL) { (void) fprintf(stderr, "getenv(penv[%d])\n", i); return (1); } } (void) snprintf(tfile, sizeof (tfile), "%s/%s", penv[0], penv[1]); /* * If the test file exists, remove it first. */ if (access(tfile, F_OK) == 0) { (void) unlink(tfile); } - ret = 0; if ((fd = open(tfile, O_WRONLY | O_CREAT | O_TRUNC, ALL_MODE)) == -1) { (void) fprintf(stderr, "open(%s) failed: %d\n", tfile, errno); return (1); } (void) close(fd); for (i = 0; i < NCOMMAND; i++) { time_t t1, t2; /* * Get original time before operating. */ ret = get_file_time(tfile, timetest_table[i].type, &t1); if (ret != 0) { (void) fprintf(stderr, "get_file_time(%s %d) = %d\n", tfile, timetest_table[i].type, ret); return (1); } /* * Sleep 2 seconds, then invoke command on given file */ (void) sleep(2); timetest_table[i].func(tfile); /* * Get time after operating. */ ret = get_file_time(tfile, timetest_table[i].type, &t2); if (ret != 0) { (void) fprintf(stderr, "get_file_time(%s %d) = %d\n", tfile, timetest_table[i].type, ret); return (1); } if (t1 == t2) { (void) fprintf(stderr, "%s: t1(%ld) == t2(%ld)\n", timetest_table[i].name, (long)t1, (long)t2); return (1); } else { (void) fprintf(stderr, "%s: t1(%ld) != t2(%ld)\n", timetest_table[i].name, (long)t1, (long)t2); } } return (0); } diff --git a/tests/zfs-tests/cmd/draid.c b/tests/zfs-tests/cmd/draid.c index 46d7b4dcc69d..3e5ff59f7399 100644 --- a/tests/zfs-tests/cmd/draid.c +++ b/tests/zfs-tests/cmd/draid.c @@ -1,1409 +1,1408 @@ /* * CDDL HEADER START * * The contents of this file are subject to the terms of the * Common Development and Distribution License (the "License"). * You may not use this file except in compliance with the License. * * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE * or https://opensource.org/licenses/CDDL-1.0. * See the License for the specific language governing permissions * and limitations under the License. * * When distributing Covered Code, include this CDDL HEADER in each * file and include the License file at usr/src/OPENSOLARIS.LICENSE. * If applicable, add the following below this CDDL HEADER, with the * fields enclosed by brackets "[]" replaced with your own identifying * information: Portions Copyright [yyyy] [name of copyright owner] * * CDDL HEADER END */ /* * Copyright (c) 2018 Intel Corporation. * Copyright (c) 2020 by Lawrence Livermore National Security, LLC. */ #include #include #include #include #include #include /* * The number of rows to generate for new permutation maps. */ #define MAP_ROWS_DEFAULT 256 /* * Key values for dRAID maps when stored as nvlists. */ #define MAP_SEED "seed" #define MAP_CHECKSUM "checksum" #define MAP_WORST_RATIO "worst_ratio" #define MAP_AVG_RATIO "avg_ratio" #define MAP_CHILDREN "children" #define MAP_NPERMS "nperms" #define MAP_PERMS "perms" static void draid_usage(void) { (void) fprintf(stderr, "usage: draid command args ...\n" "Available commands are:\n" "\n" "\tdraid generate [-cv] [-m min] [-n max] [-p passes] FILE\n" "\tdraid verify [-rv] FILE\n" "\tdraid dump [-v] [-m min] [-n max] FILE\n" "\tdraid table FILE\n" "\tdraid merge FILE SRC SRC...\n"); exit(1); } static int read_map(const char *filename, nvlist_t **allcfgs) { int block_size = 131072; int buf_size = 131072; int tmp_size, error; char *tmp_buf; struct stat64 stat; if (lstat64(filename, &stat) != 0) return (errno); if (stat.st_size == 0 || !(S_ISREG(stat.st_mode) || S_ISLNK(stat.st_mode))) { return (EINVAL); } gzFile fp = gzopen(filename, "rb"); if (fp == Z_NULL) return (errno); char *buf = malloc(buf_size); if (buf == NULL) { (void) gzclose(fp); return (ENOMEM); } ssize_t rc, bytes = 0; while (!gzeof(fp)) { rc = gzread(fp, buf + bytes, block_size); if ((rc < 0) || (rc == 0 && !gzeof(fp))) { free(buf); (void) gzerror(fp, &error); (void) gzclose(fp); return (error); } else { bytes += rc; if (bytes + block_size >= buf_size) { tmp_size = 2 * buf_size; tmp_buf = malloc(tmp_size); if (tmp_buf == NULL) { free(buf); (void) gzclose(fp); return (ENOMEM); } memcpy(tmp_buf, buf, bytes); free(buf); buf = tmp_buf; buf_size = tmp_size; } } } (void) gzclose(fp); error = nvlist_unpack(buf, bytes, allcfgs, 0); free(buf); return (error); } /* * Read a map from the specified filename. A file contains multiple maps * which are indexed by the number of children. The caller is responsible * for freeing the configuration returned. */ static int read_map_key(const char *filename, const char *key, nvlist_t **cfg) { nvlist_t *allcfgs, *foundcfg = NULL; int error; error = read_map(filename, &allcfgs); if (error != 0) return (error); (void) nvlist_lookup_nvlist(allcfgs, key, &foundcfg); if (foundcfg != NULL) { nvlist_dup(foundcfg, cfg, KM_SLEEP); error = 0; } else { error = ENOENT; } nvlist_free(allcfgs); return (error); } /* * Write all mappings to the map file. */ static int write_map(const char *filename, nvlist_t *allcfgs) { size_t buflen = 0; int error; error = nvlist_size(allcfgs, &buflen, NV_ENCODE_XDR); if (error) return (error); char *buf = malloc(buflen); if (buf == NULL) return (ENOMEM); error = nvlist_pack(allcfgs, &buf, &buflen, NV_ENCODE_XDR, KM_SLEEP); if (error) { free(buf); return (error); } /* * Atomically update the file using a temporary file and the * traditional unlink then rename steps. This code provides * no locking, it only guarantees the packed nvlist on disk * is updated atomically and is internally consistent. */ char *tmpname = calloc(1, MAXPATHLEN); if (tmpname == NULL) { free(buf); return (ENOMEM); } snprintf(tmpname, MAXPATHLEN - 1, "%s.XXXXXX", filename); int fd = mkstemp(tmpname); if (fd < 0) { error = errno; free(buf); free(tmpname); return (error); } (void) close(fd); gzFile fp = gzopen(tmpname, "w9b"); if (fp == Z_NULL) { error = errno; free(buf); free(tmpname); return (errno); } ssize_t rc, bytes = 0; while (bytes < buflen) { size_t size = MIN(buflen - bytes, 131072); rc = gzwrite(fp, buf + bytes, size); if (rc < 0) { free(buf); (void) gzerror(fp, &error); (void) gzclose(fp); (void) unlink(tmpname); free(tmpname); return (error); } else if (rc == 0) { break; } else { bytes += rc; } } free(buf); (void) gzclose(fp); if (bytes != buflen) { (void) unlink(tmpname); free(tmpname); return (EIO); } /* * Unlink the previous config file and replace it with the updated * version. If we're able to unlink the file then directory is * writable by us and the subsequent rename should never fail. */ error = unlink(filename); if (error != 0 && errno != ENOENT) { error = errno; (void) unlink(tmpname); free(tmpname); return (error); } error = rename(tmpname, filename); if (error != 0) { error = errno; (void) unlink(tmpname); free(tmpname); return (error); } free(tmpname); return (0); } /* * Add the dRAID map to the file and write it out. */ static int write_map_key(const char *filename, char *key, draid_map_t *map, double worst_ratio, double avg_ratio) { nvlist_t *nv_cfg, *allcfgs; int error; /* * Add the configuration to an existing or new file. The new * configuration will replace an existing configuration with the * same key if it has a lower ratio and is therefore better. */ error = read_map(filename, &allcfgs); if (error == ENOENT) { allcfgs = fnvlist_alloc(); } else if (error != 0) { return (error); } error = nvlist_lookup_nvlist(allcfgs, key, &nv_cfg); if (error == 0) { uint64_t nv_cfg_worst_ratio = fnvlist_lookup_uint64(nv_cfg, MAP_WORST_RATIO); double nv_worst_ratio = (double)nv_cfg_worst_ratio / 1000.0; if (worst_ratio < nv_worst_ratio) { /* Replace old map with the more balanced new map. */ fnvlist_remove(allcfgs, key); } else { /* The old map is preferable, keep it. */ nvlist_free(allcfgs); return (EEXIST); } } nvlist_t *cfg = fnvlist_alloc(); fnvlist_add_uint64(cfg, MAP_SEED, map->dm_seed); fnvlist_add_uint64(cfg, MAP_CHECKSUM, map->dm_checksum); fnvlist_add_uint64(cfg, MAP_CHILDREN, map->dm_children); fnvlist_add_uint64(cfg, MAP_NPERMS, map->dm_nperms); fnvlist_add_uint8_array(cfg, MAP_PERMS, map->dm_perms, map->dm_children * map->dm_nperms * sizeof (uint8_t)); fnvlist_add_uint64(cfg, MAP_WORST_RATIO, (uint64_t)(worst_ratio * 1000.0)); fnvlist_add_uint64(cfg, MAP_AVG_RATIO, (uint64_t)(avg_ratio * 1000.0)); error = nvlist_add_nvlist(allcfgs, key, cfg); if (error == 0) error = write_map(filename, allcfgs); nvlist_free(cfg); nvlist_free(allcfgs); return (error); } static void dump_map(draid_map_t *map, const char *key, double worst_ratio, double avg_ratio, int verbose) { if (verbose == 0) { return; } else if (verbose == 1) { printf(" \"%s\": seed: 0x%016llx worst_ratio: %2.03f " "avg_ratio: %2.03f\n", key, (u_longlong_t)map->dm_seed, worst_ratio, avg_ratio); return; } else { printf(" \"%s\":\n" " seed: 0x%016llx\n" " checksum: 0x%016llx\n" " worst_ratio: %2.03f\n" " avg_ratio: %2.03f\n" " children: %llu\n" " nperms: %llu\n", key, (u_longlong_t)map->dm_seed, (u_longlong_t)map->dm_checksum, worst_ratio, avg_ratio, (u_longlong_t)map->dm_children, (u_longlong_t)map->dm_nperms); if (verbose > 2) { printf(" perms = {\n"); for (int i = 0; i < map->dm_nperms; i++) { printf(" { "); for (int j = 0; j < map->dm_children; j++) { printf("%3d%s ", map->dm_perms[ i * map->dm_children + j], j < map->dm_children - 1 ? "," : ""); } printf(" },\n"); } printf(" }\n"); } else if (verbose == 2) { printf(" draid_perms = \n"); } } } static void dump_map_nv(const char *key, nvlist_t *cfg, int verbose) { draid_map_t map; uint_t c; uint64_t worst_ratio = fnvlist_lookup_uint64(cfg, MAP_WORST_RATIO); uint64_t avg_ratio = fnvlist_lookup_uint64(cfg, MAP_AVG_RATIO); map.dm_seed = fnvlist_lookup_uint64(cfg, MAP_SEED); map.dm_checksum = fnvlist_lookup_uint64(cfg, MAP_CHECKSUM); map.dm_children = fnvlist_lookup_uint64(cfg, MAP_CHILDREN); map.dm_nperms = fnvlist_lookup_uint64(cfg, MAP_NPERMS); map.dm_perms = fnvlist_lookup_uint8_array(cfg, MAP_PERMS, &c); dump_map(&map, key, (double)worst_ratio / 1000.0, avg_ratio / 1000.0, verbose); } /* * Print a summary of the mapping. */ static int dump_map_key(const char *filename, const char *key, int verbose) { nvlist_t *cfg; int error; error = read_map_key(filename, key, &cfg); if (error != 0) return (error); dump_map_nv(key, cfg, verbose); return (0); } /* * Allocate a new permutation map for evaluation. */ static int alloc_new_map(uint64_t children, uint64_t nperms, uint64_t seed, draid_map_t **mapp) { draid_map_t *map; int error; map = malloc(sizeof (draid_map_t)); if (map == NULL) return (ENOMEM); map->dm_children = children; map->dm_nperms = nperms; map->dm_seed = seed; map->dm_checksum = 0; error = vdev_draid_generate_perms(map, &map->dm_perms); if (error) { free(map); return (error); } *mapp = map; return (0); } /* * Allocate the fixed permutation map for N children. */ static int alloc_fixed_map(uint64_t children, draid_map_t **mapp) { const draid_map_t *fixed_map; draid_map_t *map; int error; error = vdev_draid_lookup_map(children, &fixed_map); if (error) return (error); map = malloc(sizeof (draid_map_t)); if (map == NULL) return (ENOMEM); memcpy(map, fixed_map, sizeof (draid_map_t)); VERIFY3U(map->dm_checksum, !=, 0); error = vdev_draid_generate_perms(map, &map->dm_perms); if (error) { free(map); return (error); } *mapp = map; return (0); } /* * Free a permutation map. */ static void free_map(draid_map_t *map) { free(map->dm_perms); free(map); } /* * Check if dev is in the provided list of faulted devices. */ static inline boolean_t is_faulted(int *faulted_devs, int nfaulted, int dev) { for (int i = 0; i < nfaulted; i++) if (faulted_devs[i] == dev) return (B_TRUE); return (B_FALSE); } /* * Evaluate how resilvering I/O will be distributed given a list of faulted * vdevs. As a simplification we assume one IO is sufficient to repair each * damaged device in a group. */ static double eval_resilver(draid_map_t *map, uint64_t groupwidth, uint64_t nspares, int *faulted_devs, int nfaulted, int *min_child_ios, int *max_child_ios) { uint64_t children = map->dm_children; uint64_t ngroups = 1; uint64_t ndisks = children - nspares; /* * Calculate the minimum number of groups required to fill a slice. */ while (ngroups * (groupwidth) % (children - nspares) != 0) ngroups++; int *ios = calloc(map->dm_children, sizeof (uint64_t)); ASSERT3P(ios, !=, NULL); /* Resilver all rows */ for (int i = 0; i < map->dm_nperms; i++) { uint8_t *row = &map->dm_perms[i * map->dm_children]; /* Resilver all groups with faulted drives */ for (int j = 0; j < ngroups; j++) { uint64_t spareidx = map->dm_children - nspares; boolean_t repair_needed = B_FALSE; /* See if any devices in this group are faulted */ uint64_t groupstart = (j * groupwidth) % ndisks; for (int k = 0; k < groupwidth; k++) { uint64_t groupidx = (groupstart + k) % ndisks; repair_needed = is_faulted(faulted_devs, nfaulted, row[groupidx]); if (repair_needed) break; } if (repair_needed == B_FALSE) continue; /* * This group is degraded. Calculate the number of * reads the non-faulted drives require and the number * of writes to the distributed hot spare for this row. */ for (int k = 0; k < groupwidth; k++) { uint64_t groupidx = (groupstart + k) % ndisks; if (!is_faulted(faulted_devs, nfaulted, row[groupidx])) { ios[row[groupidx]]++; } else if (nspares > 0) { while (is_faulted(faulted_devs, nfaulted, row[spareidx])) { spareidx++; } ASSERT3U(spareidx, <, map->dm_children); ios[row[spareidx]]++; spareidx++; } } } } *min_child_ios = INT_MAX; *max_child_ios = 0; /* * Find the drives with fewest and most required I/O. These values * are used to calculate the imbalance ratio. To avoid returning an * infinite value for permutations which have children that perform * no IO a floor of 1 IO per child is set. This ensures a meaningful * ratio is returned for comparison and it is not an uncommon when * there are a large number of children. */ for (int i = 0; i < map->dm_children; i++) { if (is_faulted(faulted_devs, nfaulted, i)) { ASSERT0(ios[i]); continue; } if (ios[i] == 0) ios[i] = 1; if (ios[i] < *min_child_ios) *min_child_ios = ios[i]; if (ios[i] > *max_child_ios) *max_child_ios = ios[i]; } ASSERT3S(*min_child_ios, !=, INT_MAX); ASSERT3S(*max_child_ios, !=, 0); double ratio = (double)(*max_child_ios) / (double)(*min_child_ios); free(ios); return (ratio); } /* * Evaluate the quality of the permutation mapping by considering possible * device failures. Returns the imbalance ratio for the worst mapping which * is defined to be the largest number of child IOs over the fewest number * child IOs. A value of 1.0 indicates the mapping is perfectly balance and * all children perform an equal amount of work during reconstruction. */ static void eval_decluster(draid_map_t *map, double *worst_ratiop, double *avg_ratiop) { uint64_t children = map->dm_children; double worst_ratio = 1.0; double sum = 0; int worst_min_ios = 0, worst_max_ios = 0; int n = 0; /* * When there are only 2 children there can be no distributed * spare and no resilver to evaluate. Default to a ratio of 1.0 * for this degenerate case. */ if (children == VDEV_DRAID_MIN_CHILDREN) { *worst_ratiop = 1.0; *avg_ratiop = 1.0; return; } /* * Score the mapping as if it had either 1 or 2 distributed spares. */ for (int nspares = 1; nspares <= 2; nspares++) { uint64_t faults = nspares; /* * Score groupwidths up to 19. This value was chosen as the * largest reasonable width (16d+3p). dRAID pools may be still * be created with wider stripes but they are not considered in * this analysis in order to optimize for the most common cases. */ for (uint64_t groupwidth = 2; groupwidth <= MIN(children - nspares, 19); groupwidth++) { int faulted_devs[2]; int min_ios, max_ios; /* * Score possible devices faults. This is limited * to exactly one fault per distributed spare for * the purposes of this similation. */ for (int f1 = 0; f1 < children; f1++) { faulted_devs[0] = f1; double ratio; if (faults == 1) { ratio = eval_resilver(map, groupwidth, nspares, faulted_devs, faults, &min_ios, &max_ios); if (ratio > worst_ratio) { worst_ratio = ratio; worst_min_ios = min_ios; worst_max_ios = max_ios; } sum += ratio; n++; } else if (faults == 2) { for (int f2 = f1 + 1; f2 < children; f2++) { faulted_devs[1] = f2; ratio = eval_resilver(map, groupwidth, nspares, faulted_devs, faults, &min_ios, &max_ios); if (ratio > worst_ratio) { worst_ratio = ratio; worst_min_ios = min_ios; worst_max_ios = max_ios; } sum += ratio; n++; } } } } } *worst_ratiop = worst_ratio; *avg_ratiop = sum / n; /* * Log the min/max io values for particularly unbalanced maps. * Since the maps are generated entirely randomly these are possible * be exceedingly unlikely. We log it for possible investigation. */ if (worst_ratio > 100.0) { dump_map(map, "DEBUG", worst_ratio, *avg_ratiop, 2); printf("worst_min_ios=%d worst_max_ios=%d\n", worst_min_ios, worst_max_ios); } } static int eval_maps(uint64_t children, int passes, uint64_t *map_seed, draid_map_t **best_mapp, double *best_ratiop, double *avg_ratiop) { draid_map_t *best_map = NULL; double best_worst_ratio = 1000.0; double best_avg_ratio = 1000.0; /* * Perform the requested number of passes evaluating randomly * generated permutation maps. Only the best version is kept. */ for (int i = 0; i < passes; i++) { double worst_ratio, avg_ratio; draid_map_t *map; int error; /* * Calculate the next seed and generate a new candidate map. */ error = alloc_new_map(children, MAP_ROWS_DEFAULT, vdev_draid_rand(map_seed), &map); if (error) { if (best_map != NULL) free_map(best_map); return (error); } /* * Consider maps with a lower worst_ratio to be of higher * quality. Some maps may have a lower avg_ratio but they * are discarded since they might include some particularly * imbalanced permutations. The average is tracked to in * order to get a sense of the average permutation quality. */ eval_decluster(map, &worst_ratio, &avg_ratio); if (best_map == NULL || worst_ratio < best_worst_ratio) { if (best_map != NULL) free_map(best_map); best_map = map; best_worst_ratio = worst_ratio; best_avg_ratio = avg_ratio; } else { free_map(map); } } /* * After determining the best map generate a checksum over the full * permutation array. This checksum is verified when opening a dRAID * pool to ensure the generated in memory permutations are correct. */ zio_cksum_t cksum; fletcher_4_native_varsize(best_map->dm_perms, sizeof (uint8_t) * best_map->dm_children * best_map->dm_nperms, &cksum); best_map->dm_checksum = cksum.zc_word[0]; *best_mapp = best_map; *best_ratiop = best_worst_ratio; *avg_ratiop = best_avg_ratio; return (0); } static int draid_generate(int argc, char *argv[]) { char filename[MAXPATHLEN] = {0}; uint64_t map_seed[2]; int c, fd, error, verbose = 0, passes = 1, continuous = 0; int min_children = VDEV_DRAID_MIN_CHILDREN; int max_children = VDEV_DRAID_MAX_CHILDREN; int restarts = 0; while ((c = getopt(argc, argv, ":cm:n:p:v")) != -1) { switch (c) { case 'c': continuous++; break; case 'm': min_children = (int)strtol(optarg, NULL, 0); if (min_children < VDEV_DRAID_MIN_CHILDREN) { (void) fprintf(stderr, "A minimum of 2 " "children are required.\n"); return (1); } break; case 'n': max_children = (int)strtol(optarg, NULL, 0); if (max_children > VDEV_DRAID_MAX_CHILDREN) { (void) fprintf(stderr, "A maximum of %d " "children are allowed.\n", VDEV_DRAID_MAX_CHILDREN); return (1); } break; case 'p': passes = (int)strtol(optarg, NULL, 0); break; case 'v': /* * 0 - Only log when a better map is added to the file. * 1 - Log the current best map for each child count. * Minimal output on a single summary line. * 2 - Log the current best map for each child count. * More verbose includes most map fields. * 3 - Log the current best map for each child count. * Very verbose all fields including the full map. */ verbose++; break; case ':': (void) fprintf(stderr, "missing argument for '%c' option\n", optopt); draid_usage(); break; case '?': (void) fprintf(stderr, "invalid option '%c'\n", optopt); draid_usage(); break; } } if (argc > optind) strlcpy(filename, argv[optind], sizeof (filename)); else { (void) fprintf(stderr, "A FILE must be specified.\n"); return (1); } restart: /* * Start with a fresh seed from /dev/urandom. */ fd = open("/dev/urandom", O_RDONLY); if (fd < 0) { printf("Unable to open /dev/urandom: %s\n:", strerror(errno)); return (1); } else { ssize_t bytes = sizeof (map_seed); ssize_t bytes_read = 0; while (bytes_read < bytes) { ssize_t rc = read(fd, ((char *)map_seed) + bytes_read, bytes - bytes_read); if (rc < 0) { printf("Unable to read /dev/urandom: %s\n:", strerror(errno)); close(fd); return (1); } bytes_read += rc; } (void) close(fd); } if (restarts == 0) printf("Writing generated mappings to '%s':\n", filename); /* * Generate maps for all requested child counts. The best map for * each child count is written out to the specified file. If the file * already contains a better mapping this map will not be added. */ for (uint64_t children = min_children; children <= max_children; children++) { char key[8] = { 0 }; draid_map_t *map; double worst_ratio = 1000.0; double avg_ratio = 1000.0; error = eval_maps(children, passes, map_seed, &map, &worst_ratio, &avg_ratio); if (error) { printf("Error eval_maps(): %s\n", strerror(error)); return (1); } if (worst_ratio < 1.0 || avg_ratio < 1.0) { printf("Error ratio < 1.0: worst_ratio = %2.03f " "avg_ratio = %2.03f\n", worst_ratio, avg_ratio); return (1); } snprintf(key, 7, "%llu", (u_longlong_t)children); error = write_map_key(filename, key, map, worst_ratio, avg_ratio); if (error == 0) { /* The new map was added to the file. */ dump_map(map, key, worst_ratio, avg_ratio, MAX(verbose, 1)); } else if (error == EEXIST) { /* The existing map was preferable and kept. */ if (verbose > 0) dump_map_key(filename, key, verbose); } else { printf("Error write_map_key(): %s\n", strerror(error)); return (1); } free_map(map); } /* * When the continuous option is set restart at the minimum number of * children instead of exiting. This option is useful as a mechanism * to continuous try and refine the discovered permutations. */ if (continuous) { restarts++; printf("Restarting by request (-c): %d\n", restarts); goto restart; } return (0); } /* * Verify each map in the file by generating its in-memory permutation array * and comfirming its checksum is correct. */ static int draid_verify(int argc, char *argv[]) { char filename[MAXPATHLEN] = {0}; int n = 0, c, error, verbose = 1; int check_ratios = 0; while ((c = getopt(argc, argv, ":rv")) != -1) { switch (c) { case 'r': check_ratios++; break; case 'v': verbose++; break; case ':': (void) fprintf(stderr, "missing argument for '%c' option\n", optopt); draid_usage(); break; case '?': (void) fprintf(stderr, "invalid option '%c'\n", optopt); draid_usage(); break; } } if (argc > optind) { char *abspath = malloc(MAXPATHLEN); if (abspath == NULL) return (ENOMEM); if (realpath(argv[optind], abspath) != NULL) strlcpy(filename, abspath, sizeof (filename)); else strlcpy(filename, argv[optind], sizeof (filename)); free(abspath); } else { (void) fprintf(stderr, "A FILE must be specified.\n"); return (1); } printf("Verifying permutation maps: '%s'\n", filename); /* * Lookup hardcoded permutation map for each valid number of children * and verify a generated map has the correct checksum. Then compare * the generated map values with the nvlist map values read from the * reference file to cross-check the permutation. */ for (uint64_t children = VDEV_DRAID_MIN_CHILDREN; children <= VDEV_DRAID_MAX_CHILDREN; children++) { draid_map_t *map; char key[8] = {0}; snprintf(key, 8, "%llu", (u_longlong_t)children); error = alloc_fixed_map(children, &map); if (error) { printf("Error alloc_fixed_map() failed: %s\n", error == ECKSUM ? "Invalid checksum" : strerror(error)); return (1); } uint64_t nv_seed, nv_checksum, nv_children, nv_nperms; uint8_t *nv_perms; nvlist_t *cfg; uint_t c; error = read_map_key(filename, key, &cfg); if (error != 0) { printf("Error read_map_key() failed: %s\n", strerror(error)); free_map(map); return (1); } nv_seed = fnvlist_lookup_uint64(cfg, MAP_SEED); nv_checksum = fnvlist_lookup_uint64(cfg, MAP_CHECKSUM); nv_children = fnvlist_lookup_uint64(cfg, MAP_CHILDREN); nv_nperms = fnvlist_lookup_uint64(cfg, MAP_NPERMS); nvlist_lookup_uint8_array(cfg, MAP_PERMS, &nv_perms, &c); /* * Compare draid_map_t and nvlist reference values. */ if (map->dm_seed != nv_seed) { printf("Error different seeds: 0x%016llx != " "0x%016llx\n", (u_longlong_t)map->dm_seed, (u_longlong_t)nv_seed); error = EINVAL; } if (map->dm_checksum != nv_checksum) { printf("Error different checksums: 0x%016llx " "!= 0x%016llx\n", (u_longlong_t)map->dm_checksum, (u_longlong_t)nv_checksum); error = EINVAL; } if (map->dm_children != nv_children) { printf("Error different children: %llu " "!= %llu\n", (u_longlong_t)map->dm_children, (u_longlong_t)nv_children); error = EINVAL; } if (map->dm_nperms != nv_nperms) { printf("Error different nperms: %llu " "!= %llu\n", (u_longlong_t)map->dm_nperms, (u_longlong_t)nv_nperms); error = EINVAL; } for (uint64_t i = 0; i < nv_children * nv_nperms; i++) { if (map->dm_perms[i] != nv_perms[i]) { printf("Error different perms[%llu]: " "%d != %d\n", (u_longlong_t)i, (int)map->dm_perms[i], (int)nv_perms[i]); error = EINVAL; break; } } /* * For good measure recalculate the worst and average * ratios and confirm they match the nvlist values. */ if (check_ratios) { uint64_t nv_worst_ratio, nv_avg_ratio; double worst_ratio, avg_ratio; eval_decluster(map, &worst_ratio, &avg_ratio); nv_worst_ratio = fnvlist_lookup_uint64(cfg, MAP_WORST_RATIO); nv_avg_ratio = fnvlist_lookup_uint64(cfg, MAP_AVG_RATIO); if (worst_ratio < 1.0 || avg_ratio < 1.0) { printf("Error ratio out of range %2.03f, " "%2.03f\n", worst_ratio, avg_ratio); error = EINVAL; } if ((uint64_t)(worst_ratio * 1000.0) != nv_worst_ratio) { printf("Error different worst_ratio %2.03f " "!= %2.03f\n", (double)nv_worst_ratio / 1000.0, worst_ratio); error = EINVAL; } if ((uint64_t)(avg_ratio * 1000.0) != nv_avg_ratio) { printf("Error different average_ratio %2.03f " "!= %2.03f\n", (double)nv_avg_ratio / 1000.0, avg_ratio); error = EINVAL; } } if (error) { free_map(map); nvlist_free(cfg); return (1); } if (verbose > 0) { printf("- %llu children: good\n", (u_longlong_t)children); } n++; free_map(map); nvlist_free(cfg); } if (n != (VDEV_DRAID_MAX_CHILDREN - 1)) { printf("Error permutation maps missing: %d / %d checked\n", n, VDEV_DRAID_MAX_CHILDREN - 1); return (1); } printf("Successfully verified %d / %d permutation maps\n", n, VDEV_DRAID_MAX_CHILDREN - 1); return (0); } /* * Dump the contents of the specified mapping(s) for inspection. */ static int draid_dump(int argc, char *argv[]) { char filename[MAXPATHLEN] = {0}; int c, error, verbose = 1; int min_children = VDEV_DRAID_MIN_CHILDREN; int max_children = VDEV_DRAID_MAX_CHILDREN; while ((c = getopt(argc, argv, ":vm:n:")) != -1) { switch (c) { case 'm': min_children = (int)strtol(optarg, NULL, 0); if (min_children < 2) { (void) fprintf(stderr, "A minimum of 2 " "children are required.\n"); return (1); } break; case 'n': max_children = (int)strtol(optarg, NULL, 0); if (max_children > VDEV_DRAID_MAX_CHILDREN) { (void) fprintf(stderr, "A maximum of %d " "children are allowed.\n", VDEV_DRAID_MAX_CHILDREN); return (1); } break; case 'v': verbose++; break; case ':': (void) fprintf(stderr, "missing argument for '%c' option\n", optopt); draid_usage(); break; case '?': (void) fprintf(stderr, "invalid option '%c'\n", optopt); draid_usage(); break; } } if (argc > optind) strlcpy(filename, argv[optind], sizeof (filename)); else { (void) fprintf(stderr, "A FILE must be specified.\n"); return (1); } /* * Dump maps for the requested child counts. */ for (uint64_t children = min_children; children <= max_children; children++) { char key[8] = { 0 }; snprintf(key, 7, "%llu", (u_longlong_t)children); error = dump_map_key(filename, key, verbose); if (error) { printf("Error dump_map_key(): %s\n", strerror(error)); return (1); } } return (0); } /* * Print all of the mappings as a C formatted draid_map_t array. This table * is found in the module/zcommon/zfs_draid.c file and is the definitive * source for all mapping used by dRAID. It cannot be updated without * changing the dRAID on disk format. */ static int draid_table(int argc, char *argv[]) { char filename[MAXPATHLEN] = {0}; int error; if (argc > optind) strlcpy(filename, argv[optind], sizeof (filename)); else { (void) fprintf(stderr, "A FILE must be specified.\n"); return (1); } printf("static const draid_map_t " "draid_maps[VDEV_DRAID_MAX_MAPS] = {\n"); for (uint64_t children = VDEV_DRAID_MIN_CHILDREN; children <= VDEV_DRAID_MAX_CHILDREN; children++) { uint64_t seed, checksum, nperms, avg_ratio; nvlist_t *cfg; char key[8] = {0}; snprintf(key, 8, "%llu", (u_longlong_t)children); error = read_map_key(filename, key, &cfg); if (error != 0) { printf("Error read_map_key() failed: %s\n", strerror(error)); return (1); } seed = fnvlist_lookup_uint64(cfg, MAP_SEED); checksum = fnvlist_lookup_uint64(cfg, MAP_CHECKSUM); children = fnvlist_lookup_uint64(cfg, MAP_CHILDREN); nperms = fnvlist_lookup_uint64(cfg, MAP_NPERMS); avg_ratio = fnvlist_lookup_uint64(cfg, MAP_AVG_RATIO); printf("\t{ %3llu, %3llu, 0x%016llx, 0x%016llx },\t" "/* %2.03f */\n", (u_longlong_t)children, (u_longlong_t)nperms, (u_longlong_t)seed, (u_longlong_t)checksum, (double)avg_ratio / 1000.0); nvlist_free(cfg); } printf("};\n"); return (0); } static int draid_merge_impl(nvlist_t *allcfgs, const char *srcfilename, int *mergedp) { nvlist_t *srccfgs; nvpair_t *elem = NULL; int error, merged = 0; error = read_map(srcfilename, &srccfgs); if (error != 0) return (error); while ((elem = nvlist_next_nvpair(srccfgs, elem)) != NULL) { uint64_t nv_worst_ratio; uint64_t allcfg_worst_ratio; nvlist_t *cfg, *allcfg; char *key; switch (nvpair_type(elem)) { case DATA_TYPE_NVLIST: (void) nvpair_value_nvlist(elem, &cfg); key = nvpair_name(elem); nv_worst_ratio = fnvlist_lookup_uint64(cfg, MAP_WORST_RATIO); error = nvlist_lookup_nvlist(allcfgs, key, &allcfg); if (error == 0) { allcfg_worst_ratio = fnvlist_lookup_uint64( allcfg, MAP_WORST_RATIO); if (nv_worst_ratio < allcfg_worst_ratio) { fnvlist_remove(allcfgs, key); - error = nvlist_add_nvlist(allcfgs, - key, cfg); + fnvlist_add_nvlist(allcfgs, key, cfg); merged++; } } else if (error == ENOENT) { - error = nvlist_add_nvlist(allcfgs, key, cfg); + fnvlist_add_nvlist(allcfgs, key, cfg); merged++; } else { return (error); } break; default: continue; } } nvlist_free(srccfgs); *mergedp = merged; return (0); } /* * Merge the best map for each child count found in the listed files into * a new file. This allows 'draid generate' to be run in parallel and for * the results maps to be combined. */ static int draid_merge(int argc, char *argv[]) { char filename[MAXPATHLEN] = {0}; int c, error, total_merged = 0; nvlist_t *allcfgs; while ((c = getopt(argc, argv, ":")) != -1) { switch (c) { case ':': (void) fprintf(stderr, "missing argument for '%c' option\n", optopt); draid_usage(); break; case '?': (void) fprintf(stderr, "invalid option '%c'\n", optopt); draid_usage(); break; } } if (argc < 4) { (void) fprintf(stderr, "A FILE and multiple SRCs must be specified.\n"); return (1); } strlcpy(filename, argv[optind], sizeof (filename)); optind++; error = read_map(filename, &allcfgs); if (error == ENOENT) { allcfgs = fnvlist_alloc(); } else if (error != 0) { printf("Error read_map(): %s\n", strerror(error)); return (error); } while (optind < argc) { char srcfilename[MAXPATHLEN] = {0}; int merged = 0; strlcpy(srcfilename, argv[optind], sizeof (srcfilename)); error = draid_merge_impl(allcfgs, srcfilename, &merged); if (error) { printf("Error draid_merge_impl(): %s\n", strerror(error)); nvlist_free(allcfgs); return (1); } total_merged += merged; printf("Merged %d key(s) from '%s' into '%s'\n", merged, srcfilename, filename); optind++; } if (total_merged > 0) write_map(filename, allcfgs); printf("Merged a total of %d key(s) into '%s'\n", total_merged, filename); nvlist_free(allcfgs); return (0); } int main(int argc, char *argv[]) { if (argc < 2) draid_usage(); char *subcommand = argv[1]; if (strcmp(subcommand, "generate") == 0) { return (draid_generate(argc - 1, argv + 1)); } else if (strcmp(subcommand, "verify") == 0) { return (draid_verify(argc - 1, argv + 1)); } else if (strcmp(subcommand, "dump") == 0) { return (draid_dump(argc - 1, argv + 1)); } else if (strcmp(subcommand, "table") == 0) { return (draid_table(argc - 1, argv + 1)); } else if (strcmp(subcommand, "merge") == 0) { return (draid_merge(argc - 1, argv + 1)); } else { draid_usage(); } } diff --git a/tests/zfs-tests/cmd/mkbusy.c b/tests/zfs-tests/cmd/mkbusy.c index cc4a6cfcb98c..78860381d880 100644 --- a/tests/zfs-tests/cmd/mkbusy.c +++ b/tests/zfs-tests/cmd/mkbusy.c @@ -1,167 +1,163 @@ /* * This file and its contents are supplied under the terms of the * Common Development and Distribution License ("CDDL"), version 1.0. * You may only use this file in accordance with the terms of version * 1.0 of the CDDL. * * A full copy of the text of the CDDL should have accompanied this * source. A copy of the CDDL is also available via the Internet at * http://www.illumos.org/license/CDDL. */ /* * Copyright (c) 2012 by Delphix. All rights reserved. */ /* * Make a directory busy. If the argument is an existing file or directory, * simply open it directly and pause. If not, verify that the parent directory * exists, and create a new file in that directory. */ #include #include #include #include #include #include #include #include #include #include static __attribute__((noreturn)) void usage(const char *progname) { (void) fprintf(stderr, "Usage: %s \n", progname); exit(1); } static __attribute__((noreturn)) void fail(const char *err) { perror(err); exit(1); } static void daemonize(void) { pid_t pid; if ((pid = fork()) < 0) { fail("fork"); } else if (pid != 0) { (void) fprintf(stdout, "%ld\n", (long)pid); exit(0); } (void) setsid(); (void) close(0); (void) close(1); (void) close(2); } static const char * get_basename(const char *path) { const char *bn = strrchr(path, '/'); return (bn ? bn + 1 : path); } static ssize_t get_dirnamelen(const char *path) { const char *end = strrchr(path, '/'); return (end ? end - path : -1); } int main(int argc, char *argv[]) { int c; boolean_t isdir = B_FALSE; struct stat sbuf; char *fpath = NULL; char *prog = argv[0]; while ((c = getopt(argc, argv, "")) != -1) { switch (c) { default: usage(prog); } } argc -= optind; argv += optind; if (argc != 1) usage(prog); if (stat(argv[0], &sbuf) != 0) { char *arg; const char *dname, *fname; size_t arglen; ssize_t dnamelen; /* * The argument supplied doesn't exist. Copy the path, and * remove the trailing slash if present. */ if ((arg = strdup(argv[0])) == NULL) fail("strdup"); arglen = strlen(arg); if (arg[arglen - 1] == '/') arg[arglen - 1] = '\0'; /* Get the directory and file names. */ fname = get_basename(arg); dname = arg; if ((dnamelen = get_dirnamelen(arg)) != -1) arg[dnamelen] = '\0'; else dname = "."; /* The directory portion of the path must exist */ if (stat(dname, &sbuf) != 0 || !(sbuf.st_mode & S_IFDIR)) usage(prog); if (asprintf(&fpath, "%s/%s", dname, fname) == -1) fail("asprintf"); free(arg); } else switch (sbuf.st_mode & S_IFMT) { case S_IFDIR: isdir = B_TRUE; zfs_fallthrough; case S_IFLNK: case S_IFCHR: case S_IFBLK: if ((fpath = strdup(argv[0])) == NULL) fail("strdup"); break; default: usage(prog); } if (!isdir) { - int fd; - - if ((fd = open(fpath, O_CREAT | O_RDWR, 0600)) < 0) + if (open(fpath, O_CREAT | O_RDWR, 0600) < 0) fail("open"); } else { - DIR *dp; - - if ((dp = opendir(fpath)) == NULL) + if (opendir(fpath) == NULL) fail("opendir"); } free(fpath); daemonize(); (void) pause(); return (0); } diff --git a/tests/zfs-tests/cmd/user_ns_exec.c b/tests/zfs-tests/cmd/user_ns_exec.c index 86593622399e..d781301473a9 100644 --- a/tests/zfs-tests/cmd/user_ns_exec.c +++ b/tests/zfs-tests/cmd/user_ns_exec.c @@ -1,179 +1,178 @@ /* * CDDL HEADER START * * The contents of this file are subject to the terms of the * Common Development and Distribution License (the "License"). * You may not use this file except in compliance with the License. * * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE * or https://opensource.org/licenses/CDDL-1.0. * See the License for the specific language governing permissions * and limitations under the License. * * When distributing Covered Code, include this CDDL HEADER in each * file and include the License file at usr/src/OPENSOLARIS.LICENSE. * If applicable, add the following below this CDDL HEADER, with the * fields enclosed by brackets "[]" replaced with your own identifying * information: Portions Copyright [yyyy] [name of copyright owner] * * CDDL HEADER END */ #include #include #include #include #include #include #include #include #include #include #include #include #define EXECSHELL "/bin/sh" #define UIDMAP "0 100000 65536" static int child_main(int argc, char *argv[], int sync_pipe) { char sync_buf; char cmds[BUFSIZ] = { 0 }; char sep[] = " "; int i, len; if (unshare(CLONE_NEWUSER | CLONE_NEWNS) != 0) { perror("unshare"); return (1); } /* tell parent we entered the new namespace */ if (write(sync_pipe, "1", 1) != 1) { perror("write"); return (1); } /* wait for parent to setup the uid mapping */ if (read(sync_pipe, &sync_buf, 1) != 1) { (void) fprintf(stderr, "user namespace setup failed\n"); return (1); } close(sync_pipe); if (setuid(0) != 0) { perror("setuid"); return (1); } if (setgid(0) != 0) { perror("setgid"); return (1); } len = 0; for (i = 1; i < argc; i++) { (void) snprintf(cmds+len, sizeof (cmds)-len, "%s%s", argv[i], sep); len += strlen(argv[i]) + strlen(sep); } if (execl(EXECSHELL, "sh", "-c", cmds, (char *)NULL) != 0) { perror("execl: " EXECSHELL); return (1); } return (0); } static int set_idmap(pid_t pid, const char *file) { int result = 0; int mapfd; char path[PATH_MAX]; (void) snprintf(path, sizeof (path), "/proc/%d/%s", (int)pid, file); mapfd = open(path, O_WRONLY); if (mapfd < 0) { - result = errno; perror("open"); return (errno); } if (write(mapfd, UIDMAP, sizeof (UIDMAP)-1) != sizeof (UIDMAP)-1) { perror("write"); result = (errno); } close(mapfd); return (result); } int main(int argc, char *argv[]) { char sync_buf; int result, wstatus; int syncfd[2]; pid_t child; if (argc < 2 || strlen(argv[1]) == 0) { (void) printf("\tUsage: %s ...\n", argv[0]); return (1); } if (socketpair(AF_UNIX, SOCK_STREAM, 0, syncfd) != 0) { perror("socketpair"); return (1); } child = fork(); if (child == (pid_t)-1) { perror("fork"); return (1); } if (child == 0) { close(syncfd[0]); return (child_main(argc, argv, syncfd[1])); } close(syncfd[1]); result = 0; /* wait for the child to have unshared its namespaces */ if (read(syncfd[0], &sync_buf, 1) != 1) { perror("read"); kill(child, SIGKILL); result = 1; goto reap; } /* write uid mapping */ if (set_idmap(child, "uid_map") != 0 || set_idmap(child, "gid_map") != 0) { result = 1; kill(child, SIGKILL); goto reap; } /* tell the child to proceed */ if (write(syncfd[0], "1", 1) != 1) { perror("write"); kill(child, SIGKILL); result = 1; goto reap; } close(syncfd[0]); reap: while (waitpid(child, &wstatus, 0) != child) kill(child, SIGKILL); if (result == 0) result = WEXITSTATUS(wstatus); return (result); }